# read in docs
import os
from glob import glob
import numpy as np
import pandas as pd
from textparser import TextParser
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from langmod import NgramCounter
from langmod import NgramLanguageModel
import itertools
import seaborn as sns
import plotly.express as px
from numpy.linalg import norm
from scipy.spatial.distance import pdist
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from bow_tfidf_pca import create_bow, get_tfidf, get_pca
from prince import PCA
# Apply seaborn's default plot styling to the figures drawn below.
sns.set()

# Index levels of the token table, ordered from coarsest (book) to finest
# (token). Each shorter prefix of the list addresses a larger unit of text.
OHCO = ["book_id", "chap_id", "para_num", "sent_num", "token_num"]
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in range(1, 5))

# Library table: one row per book, ordered by book_id.
LIB = pd.read_csv("dickens_pre_LIB.csv").set_index(BOOKS).sort_index()

# Token table indexed by the full OHCO; rows whose term_str is NaN are dropped.
CORPUS = pd.read_csv("dickens_pre_CORPUS.csv").set_index(OHCO)
CORPUS = CORPUS[CORPUS.term_str.notna()]

# Vocabulary table keyed by term. term_str is cast to str before it becomes
# the index so that every key is a string.
VOCAB = pd.read_csv("dickens_pre_VOCAB.csv") \
    .astype({'term_str': 'str'}) \
    .set_index('term_str')
VOCAB
| n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||
| 0 | 60 | 1 | 1.207251e-05 | 16.337915 | CD | 4 | {'RB', 'CD', 'NN', 'JJ'} | 0 | 0 | 0 | 0 |
| 1 | 38 | 1 | 7.645923e-06 | 16.996878 | CD | 5 | {'NNP', 'CD', 'VB', 'NN', 'JJ'} | 0 | 1 | 1 | 1 |
| 10 | 8 | 2 | 1.609668e-06 | 19.244805 | CD | 4 | {'NNP', 'IN', 'CD', 'NN'} | 0 | 10 | 10 | 10 |
| 100 | 4 | 3 | 8.048340e-07 | 20.244805 | CD | 4 | {'JJ', 'IN', 'CD', 'NN'} | 0 | 100 | 100 | 100 |
| 1000 | 1 | 4 | 2.012085e-07 | 22.244805 | JJ | 1 | {'JJ'} | 0 | 1000 | 1000 | 1000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| æolian | 2 | 6 | 4.024170e-07 | 21.244805 | JJ | 1 | {'JJ'} | 0 | æolian | æolian | æol |
| æsop | 1 | 4 | 2.012085e-07 | 22.244805 | NN | 1 | {'NN'} | 0 | æsop | æsop | æsop |
| éclat | 1 | 5 | 2.012085e-07 | 22.244805 | NN | 1 | {'NN'} | 0 | éclat | éclat | éclat |
| élite | 1 | 5 | 2.012085e-07 | 22.244805 | NN | 1 | {'NN'} | 0 | élite | élite | élit |
| ěngine | 1 | 6 | 2.012085e-07 | 22.244805 | NNP | 1 | {'NNP'} | 0 | ěngine | ěngine | ěngine |
55272 rows × 11 columns
LIB
| source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | |
|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||
| 98 | Dickens/98-a_tale_of_two_cities.txt | a tale of two cities | ^\s*CHAPTER\s*[IVXLCM]+\.$ | dickens | novel | 1859 | 1850 | 45 | 137089 |
| 564 | Dickens/564-the_mystery_of_edwin_drood.txt | the mystery of edwin drood | ^CHAPTER\s[IVXLCM]+\.$ | dickens | novel | 1870 | 1870 | 23 | 96378 |
| 580 | Dickens/580-the_pickwick_papers.txt | the pickwick papers | ^CHAPTER\s[IVXLCM]+\.\s[A-Z]+ | dickens | novel | 1836 | 1830 | 57 | 302570 |
| 588 | Dickens/588-master_humphreys_clock.txt | master humphreys clock | ^(?:[IVXLCM]+$|TO THE READERS OF) | dickens | stories | 1840 | 1840 | 7 | 47084 |
| 644 | Dickens/644-the_haunted_man_and_the_ghosts_bar... | the haunted man and the ghosts bargain | ^CHAPTER\s[IVXLCM]+$ | dickens | stories | 1848 | 1840 | 3 | 33904 |
| 650 | Dickens/650-pictures_from_italy.txt | pictures from italy | THE READER’S PASSPORT|GOING THROUGH FRANCE|LYO... | dickens | non-fiction | 1846 | 1840 | 11 | 73007 |
| 653 | Dickens/653-the_chimes.txt | the chimes | ^CHAPTER\s[IVXLCM]+ | dickens | novel | 1844 | 1840 | 4 | 30742 |
| 675 | Dickens/675-american_notes.txt | american notes | ^CHAPTER\s[IVXLCM]+$ | dickens | non-fiction | 1842 | 1840 | 18 | 103305 |
| 676 | Dickens/676-the_battle_of_life.txt | the battle of life | ^Part the [A-Z][a-z]+$ | dickens | novel | 1846 | 1840 | 3 | 29679 |
| 699 | Dickens/699-a_childs_history_of_england.txt | a childs history of england | ^CHAPTER\s[IVXLCM]+$ | dickens | non-fiction | 1853 | 1850 | 37 | 163271 |
| 700 | Dickens/700-the_old_curiosity_shop.txt | the old curiosity shop | ^CHAPTER\s | dickens | novel | 1840 | 1840 | 73 | 218719 |
| 730 | Dickens/730-oliver_twist.txt | oliver twist | ^\s*CHAPTER\s*[IVXLCM]+\.$ | dickens | novel | 1837 | 1830 | 53 | 158280 |
| 766 | Dickens/766-david_copperfield.txt | david copperfield | \s*(PREFACE\sTO|CHAPTER\s*[0-9]*) | dickens | novel | 1849 | 1840 | 66 | 358375 |
| 786 | Dickens/786-hard_times.txt | hard times | CHAPTER\s[IVXLCM]+ | dickens | novel | 1854 | 1850 | 16 | 75760 |
| 807 | Dickens/807-hunted_down.txt | hunted down | ^[IVXLCM]+\.$ | dickens | stories | 1859 | 1850 | 5 | 8670 |
| 809 | Dickens/809-holiday_romance.txt | holiday romance | ^PART\s[IVXLCM]+\.$ | dickens | stories | 1868 | 1860 | 4 | 13315 |
| 810 | Dickens/810-george_silvermans_explanation.txt | george silvermans explanation | [A-Z]+\sCHAPTER$ | dickens | stories | 1868 | 1860 | 9 | 11065 |
| 821 | Dickens/821-dombey_and_sons.txt | dombey and sons | ^\s*CHAPTER\s*[IVXLCM]+\.$ | dickens | novel | 1846 | 1840 | 62 | 356382 |
| 824 | Dickens/824-speeches_of_charles_dickens.txt | speeches of charles dickens | [IVXLCM]+\.$ | dickens | non-fiction | 1870 | 1870 | 58 | 87984 |
| 872 | Dickens/872-reprinted_pieces.txt | reprinted pieces | THE LONG VOYAGE$|THE BEGGING-LETTER WRITER$|A ... | dickens | stories | 1861 | 1860 | 23 | 91924 |
| 882 | Dickens/882-sketches_by_boz.txt | sketches by boz | ^(PREFACE|CHAPTER\s[IVXLCM]+) | dickens | stories | 1836 | 1830 | 57 | 184201 |
| 883 | Dickens/883-our_mutual_friend.txt | our mutual friend | ^\s*Chapter\s* | dickens | novel | 1864 | 1860 | 67 | 328190 |
| 888 | Dickens/888-the_lazy_tour_of_two_idle_apprenti... | the lazy tour of two idle apprentices | CHAPTER\s[IVXLCM]+$ | dickens | stories | 1857 | 1850 | 5 | 40510 |
| 912 | Dickens/912-the_mudfog_and_other_sketches.txt | the mudfog and other sketches | PUBLIC LIFE OF MR. TULRUMBLE$|FULL REPORT OF T... | dickens | stories | 1837 | 1830 | 7 | 30917 |
| 914 | Dickens/914-the_uncommerical_traveller.txt | the uncommerical traveller | ^[IVXLCM]+$ | dickens | non-fiction | 1860 | 1860 | 37 | 144157 |
| 916 | Dickens/916-sketches_of_young_couples.txt | sketches of young couples | AN URGENT REMONSTRANCE, &C.$|THE YOUNG COUPLE$... | dickens | stories | 1840 | 1840 | 12 | 18082 |
| 917 | Dickens/917-barnaby_rudge.txt | barnaby rudge | ^Chapter\s([0-9]+|the Last) | dickens | stories | 1841 | 1840 | 82 | 255400 |
| 918 | Dickens/918-sketches_of_young_gentlemen.txt | sketches of young gentlemen | THE BASHFUL YOUNG GENTLEMAN$|THE OUT-AND-OUT Y... | dickens | stories | 1838 | 1830 | 13 | 17063 |
| 922 | Dickens/922-sunday_under_three_heads.txt | sunday under three heads | ^[IVXLCM]+$ | dickens | non-fiction | 1836 | 1830 | 3 | 10767 |
| 927 | Dickens/927-the_lamplighter.txt | the lamplighter | ^‘IF | dickens | stories | 1838 | 1830 | 1 | 6952 |
| 967 | Dickens/967-nicholas_nickleby.txt | nicholas nickleby | ^(AUTHOR’S PREFACE|CHAPTER\s[0-9]+|Conclusion$) | dickens | novel | 1838 | 1830 | 66 | 326224 |
| 968 | Dickens/968-martin_chuzzlewit.txt | martin chuzzlewit | ^(PREFACE|CHAPTER\s[A-Z]+[-]?[A-Z]+$) | dickens | novel | 1842 | 1840 | 55 | 340276 |
| 1023 | Dickens/1023-bleak_house.txt | bleak house | ^\s*(PREFACE|CHAPTER\s*[IVXLCM]+)$ | dickens | novel | 1852 | 1850 | 68 | 357325 |
| 1289 | Dickens/1289-three_ghost_stories.txt | three ghost stories | THE HAUNTED HOUSE\.|THE TRIAL FOR MURDER\.|THE... | dickens | stories | 1860 | 1860 | 3 | 21150 |
| 1394 | Dickens/1394-the_holly_tree.txt | the holly tree | ^[A-Z]+\sBRANCH | dickens | stories | 1855 | 1850 | 3 | 13877 |
| 1400 | Dickens/1400-great_expectations.txt | great expectations | ^\s*Chapter\s*[IVXLCM]+ | dickens | novel | 1860 | 1860 | 59 | 185449 |
| 1406 | Dickens/1406-the_perils_of_certain_english_pri... | the perils of certain english prisoners | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1857 | 1850 | 2 | 19646 |
| 1407 | Dickens/1407-a_message_from_the_sea.txt | a message from the sea | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1860 | 1860 | 3 | 12416 |
| 1413 | Dickens/1413-tom_tiddlers_ground.txt | tom tiddlers ground | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1861 | 1860 | 3 | 9852 |
| 1414 | Dickens/1414-somebodys_luggage.txt | somebodys luggage | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1862 | 1860 | 4 | 19684 |
| 1415 | Dickens/1415-doctor_marigold.txt | doctor marigold | \* \* \* \* \* | dickens | stories | 1865 | 1860 | 2 | 2855 |
| 1416 | Dickens/1416-mrs_lirripers_lodgings.txt | mrs lirripers lodgings | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1863 | 1860 | 2 | 14437 |
| 1421 | Dickens/1421-mrs_lirripers_legacy.txt | mrs lirripers legacy | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1864 | 1860 | 2 | 12399 |
| 1435 | Dickens/1435-miscellaneous_papers.txt | miscellaneous papers | THE AGRICULTURAL INTEREST$|THREATENING LETTER ... | dickens | non-fiction | 1840 | 1840 | 9 | 23762 |
| 1467 | Dickens/1467-some_christmas_stories.txt | some christmas stories | A CHRISTMAS TREE[\.]?|WHAT CHRISTMAS IS AS WE ... | dickens | stories | 1850 | 1850 | 6 | 20947 |
| 2324 | Dickens/2324-a_house_to_let.txt | a house to let | OVER THE WAY$|THE MANCHESTER MARRIAGE$|GOING I... | dickens | stories | 1858 | 1850 | 6 | 34132 |
| 19337 | Dickens/19337-a_christmas_carol.txt | a christmas carol | ^\s*STAVE\s[A-Z]+$ | dickens | novel | 1843 | 1840 | 5 | 28828 |
| 20795 | Dickens/20795-the_cricket_on_the_hearth.txt | the cricket on the hearth | ^CHIRP\sTHE | dickens | novel | 1845 | 1840 | 3 | 31933 |
| 27924 | Dickens/27924-mugby_junction.txt | mugby junction | BARBOX BROTHERS$|BARBOX BROTHERS AND CO\.$|MAI... | dickens | stories | 1866 | 1860 | 7 | 50083 |
| 35536 | Dickens/35536-the_poems_and_verses_of_charles_... | the poems and verses of charles dickens | THE VILLAGE COQUETTES$|THE LAMPLIGHTER$|SONGS ... | dickens | stories | 1885 | 1880 | 13 | 10952 |
CORPUS
| pos_tuple | pos | token_str | term_str | |||||
|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | para_num | sent_num | token_num | ||||
| 98 | 1 | 0 | 0 | 0 | ('The', 'DT') | DT | The | the |
| 1 | ('Period', 'NN') | NN | Period | period | ||||
| 1 | 0 | 0 | ('It', 'PRP') | PRP | It | it | ||
| 1 | ('was', 'VBD') | VBD | was | was | ||||
| 2 | ('the', 'DT') | DT | the | the | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 13 | 16 | 0 | 12 | ('Charles', 'NNP') | NNP | Charles | charles |
| 13 | ('Dickens,', 'NNP') | NNP | Dickens, | dickens | ||||
| 14 | ('by', 'IN') | IN | by | by | ||||
| 15 | ('Charles', 'NNP') | NNP | Charles | charles | ||||
| 16 | ('Dickens', 'NNP') | NNP | Dickens | dickens |
4969964 rows × 4 columns
# Training vocabulary: the distinct terms of VOCAB's index, in sorted order.
V_TRAIN = sorted(set(VOCAB.index))
len(V_TRAIN)
55271
# Cast both string columns to str; non-string values otherwise cause errors
# when the training sentences are generated below.
CORPUS['term_str'] = CORPUS.term_str.astype('str')
# token_str must be cast from token_str itself. Casting from term_str here
# would overwrite the raw tokens with the normalized terms.
CORPUS['token_str'] = CORPUS.token_str.astype('str')
# One training sentence per (book, chapter, paragraph, sentence) group: the
# group's terms joined with single spaces.
S_TRAIN = list(CORPUS.groupby(OHCO[:-1]).term_str.apply(lambda x: ' '.join(x)).values)
len(S_TRAIN)
238366
S_TRAIN[:5]
['the period', 'it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness it was the epoch of belief it was the epoch of incredulity it was the season of light it was the season of darkness it was the spring of hope it was the winter of despair we had everything before us we had nothing before us we were all going direct to heaven we were all going direct the other way in short the period was so far like the present period that some of its noisiest authorities insisted on its being received for good or for evil in the superlative degree of comparison only', 'there were a king with a large jaw and a queen with a plain face on the throne of england there were a king with a large jaw and a queen with a fair face on the throne of france', 'in both countries it was clearer than crystal to the lords of the state preserves of loaves and fishes that things in general were settled for ever', 'it was the year of our lord one thousand seven hundred and seventy five']
# Build the n-gram counter from the training sentences and the vocabulary.
train = NgramCounter(S_TRAIN, V_TRAIN)
# Populate the counter's tables (train.I and train.LM are read below).
# NOTE(review): the exact set of attributes filled in is defined in langmod.
train.generate()
# Display the token-level table with its w0 / w1 / w2 columns.
train.I
| w0 | w1 | w2 | ||
|---|---|---|---|---|
| sent_num | token_num | |||
| 0 | 0 | <s> | <s> | the |
| 1 | <s> | the | period | |
| 2 | the | period | </s> | |
| 3 | period | </s> | <s> | |
| 4 | </s> | <s> | <s> | |
| ... | ... | ... | ... | ... |
| 238365 | 15 | dickens | by | charles |
| 16 | by | charles | dickens | |
| 17 | charles | dickens | </s> | |
| 18 | dickens | </s> | NaN | |
| 19 | </s> | NaN | NaN |
5685062 rows × 3 columns
# Terms flagged as stop words in VOCAB (stop == 1), as an array of strings.
stop_words = VOCAB[VOCAB.stop == 1].index.values
# Unigram table ordered from the most to the least frequent entry.
unigram_df = train.LM[0].sort_values(by = 'n', ascending = False)
unigram_df
| n | mle | p | log_p | |
|---|---|---|---|---|
| w0 | ||||
| <s> | 476732 | 8.385696e-02 | 8.385696e-02 | -3.575926 |
| the | 257061 | 4.521692e-02 | 4.521692e-02 | -4.466993 |
| </s> | 238366 | 4.192848e-02 | 4.192848e-02 | -4.575926 |
| and | 181975 | 3.200933e-02 | 3.200933e-02 | -4.965364 |
| of | 136169 | 2.395207e-02 | 2.395207e-02 | -5.383706 |
| ... | ... | ... | ... | ... |
| modernizing | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| modernised | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| brimmy | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| moderator | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| ěngine | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
55272 rows × 4 columns
unigram_df.filter(regex = '^[^<]', axis = 0)
| n | mle | p | log_p | |
|---|---|---|---|---|
| w0 | ||||
| the | 257061 | 4.521692e-02 | 4.521692e-02 | -4.466993 |
| and | 181975 | 3.200933e-02 | 3.200933e-02 | -4.965364 |
| of | 136169 | 2.395207e-02 | 2.395207e-02 | -5.383706 |
| to | 131034 | 2.304883e-02 | 2.304883e-02 | -5.439163 |
| a | 112660 | 1.981685e-02 | 1.981685e-02 | -5.657129 |
| ... | ... | ... | ... | ... |
| modernizing | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| modernised | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| brimmy | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| moderator | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
| ěngine | 1 | 1.758996e-07 | 1.758996e-07 | -22.438745 |
55270 rows × 4 columns
# Bigram table ordered from the most to the least frequent (w0, w1) pair.
bigram_df = train.LM[1].sort_values(by = 'n', ascending = False)
bigram_df
| n | mle | mle2 | p | log_p | ||
|---|---|---|---|---|---|---|
| w0 | w1 | |||||
| <s> | <s> | 238366 | 4.192849e-02 | 0.500000 | 0.448055 | -1.158253 |
| </s> | <s> | 238365 | 4.192831e-02 | 0.999996 | 0.811768 | -0.300860 |
| of | the | 27530 | 4.842516e-03 | 0.202175 | 0.143809 | -2.797771 |
| <s> | i | 23498 | 4.133289e-03 | 0.049290 | 0.044171 | -4.500766 |
| in | the | 22859 | 4.020889e-03 | 0.258294 | 0.159002 | -2.652885 |
| ... | ... | ... | ... | ... | ... | ... |
| his | doubled | 1 | 1.758996e-07 | 0.000016 | 0.000017 | -15.828756 |
| doublet | 1 | 1.758996e-07 | 0.000016 | 0.000017 | -15.828756 | |
| doubt | 1 | 1.758996e-07 | 0.000016 | 0.000017 | -15.828756 | |
| doughty | 1 | 1.758996e-07 | 0.000016 | 0.000017 | -15.828756 | |
| ěngine | driver | 1 | 1.758996e-07 | 1.000000 | 0.000036 | -14.754287 |
1027728 rows × 5 columns
# Flatten the (w0, w1) index into columns so each word can be filtered.
reduced_bigram = bigram_df.reset_index()
# Drop bigrams in which either word contains '<', i.e. the <s> / </s>
# sentence-padding markers.
has_marker = (reduced_bigram.w0.str.contains('<')) | (reduced_bigram.w1.str.contains('<'))
reduced_bigram = reduced_bigram.loc[~has_marker]
# Drop bigrams in which either word is a stop word, then restore the index.
has_stop = (reduced_bigram.w0.isin(stop_words)) | (reduced_bigram.w1.isin(stop_words))
reduced_bigram = reduced_bigram.loc[~has_stop].set_index(['w0', 'w1'])
reduced_bigram
| n | mle | mle2 | p | log_p | ||
|---|---|---|---|---|---|---|
| w0 | w1 | |||||
| said | mr | 4790 | 8.425591e-04 | 0.160523 | 0.056291 | -4.150964 |
| mr | pickwick | 2062 | 3.627050e-04 | 0.068685 | 0.024187 | -5.369612 |
| said | mrs | 1379 | 2.425656e-04 | 0.046213 | 0.016214 | -5.946622 |
| dont | know | 1304 | 2.293731e-04 | 0.198962 | 0.021108 | -5.566092 |
| old | man | 1215 | 2.137180e-04 | 0.121841 | 0.018638 | -5.745630 |
| ... | ... | ... | ... | ... | ... | ... |
| hired | phaeton | 1 | 1.758996e-07 | 0.010526 | 0.000036 | -14.756739 |
| post | 1 | 1.758996e-07 | 0.010526 | 0.000036 | -14.756739 | |
| service | 1 | 1.758996e-07 | 0.010526 | 0.000036 | -14.756739 | |
| serving | 1 | 1.758996e-07 | 0.010526 | 0.000036 | -14.756739 | |
| ěngine | driver | 1 | 1.758996e-07 | 1.000000 | 0.000036 | -14.754287 |
476314 rows × 5 columns
# Trigram table ordered from the most to the least frequent (w0, w1, w2) triple.
trigram_df = train.LM[2].sort_values(by = 'n', ascending = False)
trigram_df
| n | mle | mle2 | p | log_p | |||
|---|---|---|---|---|---|---|---|
| w0 | w1 | w2 | |||||
| </s> | <s> | <s> | 238365 | 4.192832e-02 | 1.000000 | 7.801894e-05 | -13.645816 |
| <s> | <s> | i | 23498 | 4.133290e-03 | 0.098579 | 7.691395e-06 | -16.988323 |
| the | 15448 | 2.717298e-03 | 0.064808 | 5.056571e-06 | -17.593409 | ||
| he | 9972 | 1.754071e-03 | 0.041835 | 3.264236e-06 | -18.224823 | ||
| it | 8071 | 1.419686e-03 | 0.033860 | 2.642025e-06 | -18.529924 | ||
| ... | ... | ... | ... | ... | ... | ... | ... |
| he | closed | one | 1 | 1.758996e-07 | 0.025641 | 6.546658e-10 | -30.508522 |
| with | 1 | 1.758996e-07 | 0.025641 | 6.546658e-10 | -30.508522 | ||
| closes | early | 1 | 1.758996e-07 | 0.500000 | 6.546658e-10 | -30.508522 | |
| his | 1 | 1.758996e-07 | 0.500000 | 6.546658e-10 | -30.508522 | ||
| ěngine | driver | whom | 1 | 1.758996e-07 | 1.000000 | 6.546658e-10 | -30.508522 |
2917374 rows × 5 columns
# Flatten the (w0, w1, w2) index into columns so each word can be filtered.
reduced_trigram = trigram_df.reset_index()
# Drop trigrams in which any word contains '<', i.e. the <s> / </s>
# sentence-padding markers.
has_marker = (reduced_trigram.w0.str.contains('<')) \
    | (reduced_trigram.w1.str.contains('<')) \
    | (reduced_trigram.w2.str.contains('<'))
reduced_trigram = reduced_trigram.loc[~has_marker]
# Drop trigrams in which any word is a stop word, then restore the index.
has_stop = (reduced_trigram.w0.isin(stop_words)) \
    | (reduced_trigram.w1.isin(stop_words)) \
    | (reduced_trigram.w2.isin(stop_words))
reduced_trigram = reduced_trigram.loc[~has_stop].set_index(['w0', 'w1', 'w2'])
reduced_trigram
| n | mle | mle2 | p | log_p | |||
|---|---|---|---|---|---|---|---|
| w0 | w1 | w2 | |||||
| said | mr | pickwick | 598 | 1.051880e-04 | 0.124843 | 1.960721e-07 | -22.282112 |
| sir | said | mr | 322 | 5.663968e-05 | 0.268110 | 1.057285e-07 | -23.173133 |
| said | mr | pecksniff | 247 | 4.344721e-05 | 0.051566 | 8.117843e-08 | -23.554328 |
| boffin | 203 | 3.570763e-05 | 0.042380 | 6.677580e-08 | -23.836099 | ||
| dombey | 201 | 3.535583e-05 | 0.041962 | 6.612114e-08 | -23.850313 | ||
| ... | ... | ... | ... | ... | ... | ... | ... |
| havisham | dwelt | upon | 1 | 1.758996e-07 | 1.000000 | 6.546658e-10 | -30.508522 |
| ever | made | 1 | 1.758996e-07 | 1.000000 | 6.546658e-10 | -30.508522 | |
| going | along | 1 | 1.758996e-07 | 1.000000 | 6.546658e-10 | -30.508522 | |
| horribly | cruel | 1 | 1.758996e-07 | 1.000000 | 6.546658e-10 | -30.508522 | |
| however | sir | 1 | 1.758996e-07 | 1.000000 | 6.546658e-10 | -30.508522 |
271763 rows × 5 columns
LIB.type.value_counts()
stories 26 novel 17 non-fiction 7 Name: type, dtype: int64
# Build the n-gram language model on top of the n-gram counter.
model = NgramLanguageModel(train)
# Set the smoothing constant to k = 1 before smoothing is applied; the intent
# is to avoid zero probability for n-grams unseen in the training data.
model.k = 1
model.apply_smoothing()
# n-gram token tables held by the model (unigram, bigram, trigram).
ngram = model.NG
# Per-order language-model tables (unigram, bigram, trigram).
# NOTE(review): presumably these carry counts, MLE and (log) joint/conditional
# probabilities — confirm the columns against langmod.
LM = model.LM
# Z1 / Z2 are kept under short names for later cells; their meaning is
# defined in langmod and is not visible here.
Z1 = model.Z1
Z2 = model.Z2
# .generate_text() method of the langmod.NgramLanguageModel object (model)
model.generate_text()
01. I NEVER THOUGHT ABOUT GOING TO BE DYING IN THAT NEIGHBOURHOOD BY ACCIDENT. 02. YES SIR. 03. OF OUR TWO INDIVIDUAL NATURES AND THEY LIVE IN A MOMENT. 04. GIVE HIM TO GO OUT WITH THE OBJECT OF THE CONTENTS OF HIS. 05. BURIED HOW LONG THAT HE WAS CARRIED OFF UPON HER AT THE ALBION IN LITTLE RUSSELL STREET. 06. FOR A MOMENT. 07. MUCH NEEDED COULD A FEW LEAVES AND THERE HE SAID AGAIN. 08. YES CHILD SAID THE MILITARY GENTLEMAN ADDRESSING THE OLD GENTLEMAN NOT EIGHT BLUE SKIES IN AS INDIFFERENT AND SO HAVE I MAY SAID MR CHESTER. 09. IT HAD RECEIVED IN TURF AND THE SEDATE FACE IN THE COMPANY OF A BUCKET AND MRS QUILP HE SAID THAT WHEREVER GAY IS PROBABLY ACQUAINTEDCANNOT SAY A COINCIDENCE A REMARKABLE KIND OF NEAT LITTLE ROOM PUT A BRIGHTER COUNTENANCE AND AS THERE CAN BE TAKEN OFF THE GROUND THE STOPPING OF THE UNFORTUNATE CHILDREN WHOSE WERRY SUBSISTENCE DEPENDS ON THEIR WAY. 10. O NO. 11. I HAD TO TELL ME ALL THEM ANCESTORS OF YOURS I BEG YOUR PARDON AND TELL US. 12. MY DEAR BOY ME AND THANK GOD SAID THE MAGISTRATE. 13. NOT YOU SAYS TWEMLOW AT THE LODGE WHERE SUCH DREAMS COME. 14. THAT DOOR AND SAID THAT BEFORE THE HOUSE. 15. AND WHY IN A VERY DECENT AIM AT ME SIRRAH DEMANDED RALPH. 16. ALL REPLIED NICHOLAS. 17. EVENSON. 18. MR PECKSNIFF SEVERAL TIMES LEAVING MY FRIEND DOMBEY WILL BE PLEASED THAT HE WAS READING. 19. OTHER SOUND INAUDIBLE. 20. I WONDER HOW IT CHIRPED.
# .mle in the unigram, bigram, and trigram tables in the language model
V = len(VOCAB)
# Redundancy, in whole percent, of the unigram, bigram and trigram models:
#     R = 1 - H / Hmax
# H is the entropy of the table's `mle` distribution. Hmax = log2(V ** order)
# is the entropy of a uniform distribution over every possible n-gram of that
# order built from a vocabulary of size V.
R = []
for i in range(3):
    N = V**(i+1)
    H = (train.LM[i]['mle'] * np.log2(1/train.LM[i]['mle'])).sum()
    Hmax = np.log2(N)
    # Scale to percent first, then round. Rounding to two decimals and then
    # multiplying by 100 can land just below an integer
    # (0.58 * 100 == 57.99999999999999), which int() would truncate to 57.
    R.append(int(round((1 - H/Hmax) * 100)))
R
[41, 50, 59]
# BGX = model.LM[1].n.unstack() so use method below), explore the relationship
# between bigram pairs using the following lists for the first and second
# words of the bigrams of interest
w0 = ['he', 'she']
w1 = ['said', 'heard']
# Every (first word, second word) pairing, ordered by w0 and then by w1.
# itertools.product is the direct Cartesian product; filtering
# combinations(w0 + w1, 2) yields duplicate pairs when the lists share a word.
bigram_pairs = list(itertools.product(w0, w1))
LM[1].loc[bigram_pairs].n.unstack()
| w1 | said | heard |
|---|---|---|
| w0 | ||
| he | 1855 | 197 |
| she | 692 | 59 |
# Plot every term's count n in ascending order as points on a logarithmic y-axis.
VOCAB.n.sort_values().plot(ylabel = "log_frequency", logy=True, style = '.', rot = 45, title = "Log Term Frequency");
# VOCAB
# Add a 1-based frequency rank as the first column: rank 1 is the term with
# the largest n, and ties get consecutive ranks in sorted order. The guard
# skips the work if the column already exists, so the cell can be re-run.
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending = False)
    VOCAB.insert(0, 'term_rank', range(1, len(VOCAB) + 1))
VOCAB.term_rank.plot(ylabel = "term_rank", logx = False, rot = 45, title = "Term Rank");
VOCAB.head()
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||
| the | 1 | 257061 | 3 | 0.051723 | 4.273054 | DT | 20 | {'NNP', 'CD', 'NNS', 'FW', 'POS', 'PRP', 'CC',... | 1 | the | the | the |
| and | 2 | 181975 | 3 | 0.036615 | 4.771425 | CC | 18 | {'NNP', 'CC', 'VBD', 'CD', 'DT', 'PDT', 'VBN',... | 1 | and | and | and |
| of | 3 | 136169 | 2 | 0.027398 | 5.189767 | IN | 18 | {'NNP', 'PRP', 'VBD', 'CD', 'PDT', 'VBN', 'VBP... | 1 | of | of | of |
| to | 4 | 131034 | 2 | 0.026365 | 5.245224 | TO | 21 | {'NNP', 'CD', 'EX', 'NNS', 'FW', 'POS', 'CC', ... | 1 | to | to | to |
| a | 5 | 112660 | 1 | 0.022668 | 5.463190 | DT | 19 | {'NNP', 'CD', 'NNS', 'FW', 'POS', 'PRP', 'VBN'... | 1 | a | a | a |
# Second ranking in which all terms with the same count share one rank.
# freq_counts maps each distinct count n to the number of terms having that
# count (e.g. 18273 terms appear 1 time), ordered from the largest n down.
freq_counts = VOCAB.n.value_counts().sort_index(ascending = False)
# new_rank is indexed by n, with term_rank2 as the 0-based position of that
# count in the descending order and nn as the number of terms with that count.
# The frame is built explicitly because the column names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
new_rank = pd.DataFrame(
    {'term_rank2': range(len(freq_counts)), 'nn': freq_counts.to_numpy()},
    index = pd.Index(freq_counts.index, name = 'n'))
# Look up each term's rank by its count and shift to a 1-based rank.
VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1
VOCAB.term_rank2.plot(ylabel = 'term_rank2', logx = False, rot = 45, title = "Term Rank 2 (Words with Same Frequency Assigned Equal Rank)");
# term_rank and term_rank2
# Frequency-times-rank product for each ranking (the quantity Zipf's law
# expects to be roughly constant), plotted as pixel markers.
VOCAB['zipf_k'] = VOCAB['n'] * VOCAB['term_rank']
VOCAB['zipf_k2'] = VOCAB['n'] * VOCAB['term_rank2']
VOCAB['zipf_k'].plot(style = ',', rot = 45);
VOCAB['zipf_k2'].plot(style = ',', rot = 45);
n)
As rank (term_rank2) increases, frequency (n) decreases.
# Scatter of term_rank2 (x) against n (y) on linear axes. Points are colored
# by the term's max_pos tag and the hover label shows the term itself.
px.scatter(
    VOCAB.reset_index(),
    x = 'term_rank2',
    y = 'n',
    color = 'max_pos',
    hover_name = 'term_str',
    title = 'Term Rank (2) vs. Frequency (n)',
    log_x = False,
    log_y = False,
    width = 800,
    height = 500,
)
# Bag-of-words table built from CORPUS with chapters (book_id, chap_id) as the bags.
BOW = create_bow(CORPUS, CHAPS)
# Compute the TF-IDF tables with the 'max' TF method and the 'standard' IDF
# method. The call also rebinds BOW and VOCAB to the versions it returns.
# NOTE(review): the contents of each returned table are defined in
# bow_tfidf_pca — confirm there.
DTCM, TFIDF, BOW, DFIDF, VOCAB = get_tfidf(BOW, VOCAB, tf_method = 'max', idf_method = 'standard')
VOCAB
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| the | 1 | 257061 | 3 | 5.172286e-02 | 4.273054 | DT | 20 | {'NNP', 'CD', 'NNS', 'FW', 'POS', 'PRP', 'CC',... | 1 | the | the | the | 1 | 257061 | 257061 | 0.000000 | 0.000000 | 1182.0 | 0.000000 | 0.000000 |
| and | 2 | 181975 | 3 | 3.661492e-02 | 4.771425 | CC | 18 | {'NNP', 'CC', 'VBD', 'CD', 'DT', 'PDT', 'VBN',... | 1 | and | and | and | 2 | 363950 | 363950 | 0.000000 | 0.000000 | 1182.0 | 0.000000 | 0.000000 |
| of | 3 | 136169 | 2 | 2.739836e-02 | 5.189767 | IN | 18 | {'NNP', 'PRP', 'VBD', 'CD', 'PDT', 'VBN', 'VBP... | 1 | of | of | of | 3 | 408507 | 408507 | 0.000000 | 0.000000 | 1182.0 | 0.000000 | 0.000000 |
| to | 4 | 131034 | 2 | 2.636515e-02 | 5.245224 | TO | 21 | {'NNP', 'CD', 'EX', 'NNS', 'FW', 'POS', 'CC', ... | 1 | to | to | to | 4 | 524136 | 524136 | 0.000000 | 0.000000 | 1182.0 | 0.000000 | 0.000000 |
| a | 5 | 112660 | 1 | 2.266815e-02 | 5.463190 | DT | 19 | {'NNP', 'CD', 'NNS', 'FW', 'POS', 'PRP', 'VBN'... | 1 | a | a | a | 5 | 563300 | 563300 | 0.000000 | 0.000000 | 1182.0 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| lushed | 55268 | 1 | 6 | 2.012085e-07 | 22.244805 | VBP | 1 | {'VBP'} | 0 | lush | lush | lush | 1225 | 55268 | 1225 | 0.031601 | 0.031601 | 1.0 | 10.207014 | 10.207014 |
| lurker | 55269 | 1 | 6 | 2.012085e-07 | 22.244805 | NN | 1 | {'NN'} | 0 | lurker | lurker | lurk | 1225 | 55269 | 1225 | 0.046396 | 0.046396 | 1.0 | 10.207014 | 10.207014 |
| lunns | 55270 | 1 | 5 | 2.012085e-07 | 22.244805 | NNP | 1 | {'NNP'} | 0 | lunn | lunn | lun | 1225 | 55270 | 1225 | 0.029586 | 0.029586 | 1.0 | 10.207014 | 10.207014 |
| lungsa | 55271 | 1 | 6 | 2.012085e-07 | 22.244805 | NN | 1 | {'NN'} | 0 | lungsa | lungsa | lungs | 1225 | 55271 | 1225 | 0.054583 | 0.054583 | 1.0 | 10.207014 | 10.207014 |
| ěngine | 55272 | 1 | 6 | 2.012085e-07 | 22.244805 | NNP | 1 | {'NNP'} | 0 | ěngine | ěngine | ěngine | 1225 | 55272 | 1225 | 0.011180 | 0.011180 | 1.0 | 10.207014 | 10.207014 |
55272 rows × 20 columns
# DTCM
DTCM
| term_str | 0 | 1 | 10 | 100 | 1000 | 10000 | 10000l | 1000l | 100l | 1030 | ... | zooks | zoological | zounds | zulu | à | æolian | æsop | éclat | élite | ěngine | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 98 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 13 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1182 rows × 55270 columns
# VOCAB, TFIDF matrix to the 1000 most significant terms
# Open-class POS tags: nouns, verbs, adjectives and adverbs.
open_cats = [
    'NN', 'NNS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'JJ', 'JJR', 'JJS',
    'RB', 'RBR', 'RBS',
]
# Keep terms whose max_pos is an open-class tag, order them by dfidf from
# highest to lowest, and take the first 1000.
is_open = VOCAB.max_pos.isin(open_cats)
SIGS = VOCAB.loc[is_open].sort_values('dfidf', ascending = False).head(1000)
SIGS.head(10).index.values
array(['seem', 'entered', 'sleep', 'cut', 'top', 'windows', 'dress',
'thank', 'worse', 'sent'], dtype=object)
TFIDF
| term_str | 0 | 1 | 10 | 100 | 1000 | 10000 | 10000l | 1000l | 100l | 1030 | ... | zooks | zoological | zounds | zulu | à | æolian | æsop | éclat | élite | ěngine | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 98 | 1 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 3 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 4 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 5 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 9 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 11 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 12 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 13 | 0.0 | 0.255548 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1182 rows × 55270 columns
# Restrict the TFIDF matrix to the columns of the significant terms in SIGS.
TFIDF_sigs = TFIDF[SIGS.index]
TFIDF_sigs
| term_str | seem | entered | sleep | cut | top | windows | dress | thank | worse | sent | ... | peculiar | pace | art | kindness | kissed | demanded | strong | felt | tall | profound | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 98 | 1 | 0.0 | 0.000000 | 0.000000 | 0.018297 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.018129 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.0 | 0.000000 | 0.000000 | 0.007300 | 0.014634 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.007351 | ... | 0.0 | 0.012152 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 3 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.011779 | 0.011779 | 0.000000 | 0.000000 | 0.000000 | 0.011833 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.018890 | 0.000000 | 0.000000 | 0.019668 | |
| 4 | 0.0 | 0.000000 | 0.007121 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.014143 | 0.021166 | 0.007170 | ... | 0.0 | 0.000000 | 0.011885 | 0.0 | 0.0 | 0.000000 | 0.015261 | 0.007630 | 0.023834 | 0.000000 | |
| 5 | 0.0 | 0.005042 | 0.000000 | 0.005054 | 0.010131 | 0.010131 | 0.005019 | 0.000000 | 0.005008 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.008436 | 0.008124 | 0.005416 | 0.025375 | 0.008458 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 9 | 0.0 | 0.000000 | 0.024499 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 10 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | |
| 11 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.011224 | 0.011224 | 0.000000 | 0.000000 | |
| 12 | 0.0 | 0.000000 | 0.027797 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.014894 | 0.000000 | 0.000000 | 0.046521 | |
| 13 | 0.0 | 0.000000 | 0.065703 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.109960 |
1182 rows × 1000 columns
# Column-wise totals: one summed TFIDF value per term across all rows.
term_totals = TFIDF_sigs.sum(axis=0)
print('max term TFIDF value:', max(term_totals))
print('max TFIDF term:', term_totals.idxmax())
max term TFIDF value: 16.677083003000984 max TFIDF term: says
# Row-wise totals: one summed TFIDF value per (book_id, chap_id) row.
chap_totals = TFIDF_sigs.sum(axis=1)
# Index label of the row with the largest total; its first element is the book_id.
top_chap = chap_totals.idxmax()
print('Max total TFIDF value by book and chapter:', max(chap_totals))
print('(Book_id, chap_id) with max total TFIDF: ', top_chap)
print('Title of book with max total TFIDF:', LIB.loc[top_chap[0]].title.title())
Max total TFIDF value by book and chapter: 6.716943644781024 (Book_id, chap_id) with max total TFIDF: (786, 2) Title of book with max total TFIDF: Hard Times
# Scatter of term rank against mean TFIDF, one point per VOCAB row.
# The y axis is logarithmic; the x axis is left linear.
rank_vs_tfidf_opts = dict(
    x='term_rank2',
    y='tfidf_mean_chap_max',
    title='Term Rank vs. TFIDF Mean (with chaps as bags of words, max TF method)',
    color='max_pos',
    size='n_pos',
    hover_name='term_str',
    hover_data=['n', 'i'],
    log_y=True,
    log_x=False,
)
px.scatter(VOCAB.reset_index(), **rank_vs_tfidf_opts)
# Scatter of term rank against DFIDF, one point per VOCAB row (linear axes).
rank_vs_dfidf_opts = dict(
    x='term_rank2',
    y='dfidf',
    title='Term Rank vs. DFIDF',
    color='max_pos',
    size='n_pos',
    hover_name='term_str',
    hover_data=['n', 'i'],
)
px.scatter(VOCAB.reset_index(), **rank_vs_dfidf_opts)
# Collapse the chapter-level TFIDF rows to one row per book
# (BOOKS = OHCO[:1], i.e. ['book_id']) by averaging every term column.
tfidf_by_book = TFIDF.groupby(BOOKS)
mean_TFIDF = tfidf_by_book.mean()
mean_TFIDF
| term_str | 0 | 1 | 10 | 100 | 1000 | 10000 | 10000l | 1000l | 100l | 1030 | ... | zooks | zoological | zounds | zulu | à | æolian | æsop | éclat | élite | ěngine |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||||||||||||
| 98 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 564 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 580 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 588 | 0.043555 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 644 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 650 | 0.000000 | 0.000558 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 653 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 675 | 0.000000 | 0.003390 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000621 |
| 676 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 699 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.014970 | 0.000829 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 700 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 730 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 766 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 786 | 0.006187 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 807 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 809 | 0.000000 | 0.005130 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 810 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 821 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 824 | 0.000000 | 0.020283 | 0.012691 | 0.005939 | 0.001913 | 0.001257 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.001011 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 872 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.001949 | 0.000000 | 0.00848 | 0.002936 | 0.001494 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 882 | 0.000166 | 0.000444 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.001721 | 0.000223 | 0.000223 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000764 | 0.000000 | 0.000000 | 0.000000 | 0.000814 | 0.000000 |
| 883 | 0.000000 | 0.000312 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 888 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 912 | 0.006222 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.016480 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 914 | 0.000000 | 0.001287 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.001353 | 0.000000 | 0.00000 | 0.001607 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 916 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 917 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000669 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 918 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.011379 | 0.000000 | 0.000000 |
| 922 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.008231 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 927 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 967 | 0.000000 | 0.000451 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000463 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 968 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1023 | 0.000000 | 0.000285 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1289 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1394 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.003969 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1400 | 0.000000 | 0.000696 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1406 | 0.000000 | 0.006614 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1407 | 0.000000 | 0.006741 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1413 | 0.000000 | 0.030722 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1414 | 0.381109 | 0.084841 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.003569 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1415 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1416 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1421 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.005864 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1435 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1467 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.005797 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2324 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 19337 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 20795 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 27924 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.009211 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 35536 | 0.000000 | 0.019658 | 0.019656 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.007747 | 0.000000 | 0.005948 | 0.000000 | 0.000000 | 0.000000 |
50 rows × 55270 columns
# Same per-book averaging as for the full table, restricted to the
# significant-term columns held in TFIDF_sigs.
sigs_by_book = TFIDF_sigs.groupby(BOOKS)
mean_TFIDF_sigs = sigs_by_book.mean()
mean_TFIDF_sigs
| term_str | seem | entered | sleep | cut | top | windows | dress | thank | worse | sent | ... | peculiar | pace | art | kindness | kissed | demanded | strong | felt | tall | profound |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||||||||||||
| 98 | 0.002393 | 0.003802 | 0.003404 | 0.002195 | 0.003124 | 0.004978 | 0.004351 | 0.005878 | 0.003747 | 0.002506 | ... | 0.001738 | 0.001090 | 0.001950 | 0.000459 | 0.006159 | 0.002523 | 0.006633 | 0.002546 | 0.003738 | 0.001328 |
| 564 | 0.010476 | 0.002771 | 0.004620 | 0.003763 | 0.004910 | 0.009007 | 0.004279 | 0.009971 | 0.004176 | 0.002999 | ... | 0.001572 | 0.003426 | 0.001709 | 0.005448 | 0.006385 | 0.004860 | 0.005815 | 0.002857 | 0.001353 | 0.000854 |
| 580 | 0.001633 | 0.006892 | 0.004072 | 0.003512 | 0.004892 | 0.002329 | 0.002140 | 0.003107 | 0.002812 | 0.003906 | ... | 0.002933 | 0.003073 | 0.001058 | 0.002970 | 0.002512 | 0.002451 | 0.002353 | 0.003885 | 0.005744 | 0.005382 |
| 588 | 0.005328 | 0.002433 | 0.003914 | 0.004394 | 0.005007 | 0.001695 | 0.001646 | 0.000554 | 0.002704 | 0.001996 | ... | 0.008652 | 0.004240 | 0.002823 | 0.003903 | 0.002602 | 0.003754 | 0.004059 | 0.007125 | 0.001124 | 0.002809 |
| 644 | 0.001645 | 0.002681 | 0.008029 | 0.002687 | 0.003306 | 0.003628 | 0.002669 | 0.003647 | 0.004668 | 0.001714 | ... | 0.001105 | 0.001105 | 0.000000 | 0.002215 | 0.012132 | 0.003323 | 0.001968 | 0.007221 | 0.000000 | 0.002849 |
| 650 | 0.002964 | 0.004217 | 0.002264 | 0.002753 | 0.005130 | 0.010857 | 0.004136 | 0.000236 | 0.004408 | 0.003582 | ... | 0.000643 | 0.002314 | 0.002853 | 0.000000 | 0.001313 | 0.000000 | 0.003488 | 0.002304 | 0.002603 | 0.000157 |
| 653 | 0.006889 | 0.002919 | 0.006016 | 0.006939 | 0.002016 | 0.003991 | 0.004952 | 0.006012 | 0.000000 | 0.001975 | ... | 0.003401 | 0.004923 | 0.000000 | 0.001748 | 0.005076 | 0.000000 | 0.004352 | 0.006439 | 0.000000 | 0.000000 |
| 675 | 0.005027 | 0.001698 | 0.003040 | 0.003983 | 0.005156 | 0.007240 | 0.003041 | 0.000871 | 0.002810 | 0.002865 | ... | 0.002345 | 0.004288 | 0.001284 | 0.001235 | 0.001501 | 0.000464 | 0.004678 | 0.002227 | 0.005158 | 0.000687 |
| 676 | 0.008455 | 0.001075 | 0.000930 | 0.006221 | 0.004716 | 0.002210 | 0.004477 | 0.004379 | 0.003253 | 0.005625 | ... | 0.001794 | 0.005137 | 0.001553 | 0.004255 | 0.009159 | 0.000000 | 0.000498 | 0.004095 | 0.000000 | 0.000000 |
| 699 | 0.000934 | 0.000836 | 0.000395 | 0.004907 | 0.001019 | 0.001196 | 0.003669 | 0.000059 | 0.003387 | 0.018815 | ... | 0.000000 | 0.000326 | 0.001023 | 0.000237 | 0.001597 | 0.003599 | 0.004672 | 0.000752 | 0.001947 | 0.000102 |
| 700 | 0.003311 | 0.005115 | 0.009056 | 0.002884 | 0.002256 | 0.004031 | 0.004122 | 0.005187 | 0.002878 | 0.002573 | ... | 0.002316 | 0.005891 | 0.001076 | 0.002826 | 0.001529 | 0.003137 | 0.004586 | 0.005661 | 0.002613 | 0.002682 |
| 730 | 0.001835 | 0.006976 | 0.005113 | 0.003666 | 0.004380 | 0.002570 | 0.002549 | 0.002576 | 0.005081 | 0.002758 | ... | 0.002065 | 0.007719 | 0.000776 | 0.001719 | 0.000718 | 0.009208 | 0.004346 | 0.004608 | 0.002600 | 0.002969 |
| 766 | 0.006026 | 0.003700 | 0.006636 | 0.003710 | 0.004879 | 0.004065 | 0.004315 | 0.007947 | 0.003071 | 0.004342 | ... | 0.001928 | 0.002125 | 0.005747 | 0.003855 | 0.005140 | 0.000783 | 0.006995 | 0.012639 | 0.001896 | 0.001839 |
| 786 | 0.004518 | 0.002056 | 0.004104 | 0.003240 | 0.000131 | 0.002032 | 0.004440 | 0.008957 | 0.005074 | 0.010989 | ... | 0.003008 | 0.001693 | 0.004750 | 0.005647 | 0.008408 | 0.001093 | 0.002019 | 0.006293 | 0.003405 | 0.002540 |
| 807 | 0.017699 | 0.000000 | 0.000000 | 0.002110 | 0.002115 | 0.002115 | 0.000000 | 0.016911 | 0.007345 | 0.013838 | ... | 0.000000 | 0.005468 | 0.024126 | 0.000000 | 0.000000 | 0.000000 | 0.005669 | 0.004539 | 0.000000 | 0.005147 |
| 809 | 0.000000 | 0.002962 | 0.000000 | 0.009059 | 0.007700 | 0.000000 | 0.001310 | 0.004514 | 0.001218 | 0.010815 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.007157 | 0.002475 | 0.003497 | 0.009514 | 0.000000 |
| 810 | 0.007320 | 0.006486 | 0.000000 | 0.000000 | 0.000000 | 0.010435 | 0.000000 | 0.005908 | 0.000000 | 0.001155 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.008936 | 0.005043 | 0.007205 | 0.000000 | 0.000000 |
| 821 | 0.003427 | 0.003858 | 0.004322 | 0.003516 | 0.002988 | 0.004392 | 0.003857 | 0.008407 | 0.002865 | 0.003103 | ... | 0.002106 | 0.002655 | 0.001874 | 0.003326 | 0.004993 | 0.001428 | 0.003638 | 0.007421 | 0.003945 | 0.002296 |
| 824 | 0.002665 | 0.000538 | 0.000719 | 0.000705 | 0.000981 | 0.000272 | 0.000527 | 0.008800 | 0.001557 | 0.000748 | ... | 0.005568 | 0.000506 | 0.027689 | 0.007908 | 0.000000 | 0.002971 | 0.005279 | 0.006542 | 0.001224 | 0.001964 |
| 872 | 0.004458 | 0.001253 | 0.006411 | 0.004847 | 0.005617 | 0.004636 | 0.001825 | 0.000664 | 0.004523 | 0.004990 | ... | 0.002821 | 0.001636 | 0.007345 | 0.000376 | 0.002124 | 0.001857 | 0.003248 | 0.001346 | 0.001675 | 0.003855 |
| 882 | 0.002308 | 0.003893 | 0.002298 | 0.003083 | 0.005221 | 0.004121 | 0.006375 | 0.001652 | 0.001933 | 0.004494 | ... | 0.004629 | 0.001616 | 0.001580 | 0.001613 | 0.000764 | 0.000305 | 0.001745 | 0.002904 | 0.005331 | 0.002684 |
| 883 | 0.006677 | 0.002344 | 0.003785 | 0.006628 | 0.003369 | 0.003248 | 0.006358 | 0.008766 | 0.006230 | 0.002494 | ... | 0.001602 | 0.002741 | 0.004119 | 0.002358 | 0.005897 | 0.005219 | 0.003976 | 0.004487 | 0.001988 | 0.000710 |
| 888 | 0.003790 | 0.003175 | 0.003524 | 0.002611 | 0.006719 | 0.006547 | 0.001476 | 0.001917 | 0.001929 | 0.001770 | ... | 0.005863 | 0.003556 | 0.003890 | 0.000919 | 0.000566 | 0.002164 | 0.002159 | 0.003987 | 0.001777 | 0.000000 |
| 912 | 0.000000 | 0.005130 | 0.001595 | 0.001858 | 0.008419 | 0.001955 | 0.005252 | 0.003798 | 0.000839 | 0.002670 | ... | 0.006565 | 0.000592 | 0.000000 | 0.008827 | 0.000000 | 0.000706 | 0.002817 | 0.003740 | 0.004957 | 0.003441 |
| 914 | 0.005221 | 0.001883 | 0.003344 | 0.003503 | 0.003771 | 0.005822 | 0.003032 | 0.000712 | 0.003725 | 0.003367 | ... | 0.003214 | 0.001392 | 0.004297 | 0.000600 | 0.000226 | 0.000208 | 0.004134 | 0.002591 | 0.004384 | 0.003482 |
| 916 | 0.005401 | 0.000000 | 0.000000 | 0.002974 | 0.002333 | 0.001161 | 0.008105 | 0.001173 | 0.001456 | 0.000000 | ... | 0.001947 | 0.002445 | 0.005029 | 0.000000 | 0.004767 | 0.000000 | 0.002618 | 0.002251 | 0.004978 | 0.002291 |
| 917 | 0.003310 | 0.005957 | 0.007799 | 0.003352 | 0.003465 | 0.004823 | 0.006813 | 0.005851 | 0.004272 | 0.003438 | ... | 0.002395 | 0.007806 | 0.001522 | 0.001387 | 0.001697 | 0.005253 | 0.006328 | 0.004896 | 0.004645 | 0.003861 |
| 918 | 0.001018 | 0.001018 | 0.000000 | 0.001635 | 0.001527 | 0.002882 | 0.001434 | 0.001600 | 0.000000 | 0.001027 | ... | 0.018174 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.005062 | 0.004096 | 0.000000 | 0.006532 |
| 922 | 0.005485 | 0.000000 | 0.000000 | 0.000000 | 0.001632 | 0.003634 | 0.006466 | 0.000000 | 0.003226 | 0.005353 | ... | 0.000000 | 0.003221 | 0.005947 | 0.000000 | 0.000000 | 0.000000 | 0.001778 | 0.000000 | 0.000000 | 0.008687 |
| 927 | 0.000000 | 0.003765 | 0.000000 | 0.007548 | 0.003783 | 0.000000 | 0.000000 | 0.000000 | 0.003739 | 0.000000 | ... | 0.000000 | 0.000000 | 0.012598 | 0.000000 | 0.006299 | 0.006299 | 0.002022 | 0.002022 | 0.006316 | 0.000000 |
| 967 | 0.005470 | 0.002989 | 0.004341 | 0.003985 | 0.005311 | 0.002392 | 0.003876 | 0.004675 | 0.004885 | 0.004470 | ... | 0.002716 | 0.002523 | 0.001603 | 0.005545 | 0.003157 | 0.008412 | 0.003932 | 0.005478 | 0.004314 | 0.003798 |
| 968 | 0.004166 | 0.004235 | 0.005099 | 0.004158 | 0.005300 | 0.002521 | 0.003120 | 0.006518 | 0.004579 | 0.003027 | ... | 0.004872 | 0.002714 | 0.004911 | 0.005783 | 0.003040 | 0.002184 | 0.006322 | 0.005971 | 0.002106 | 0.001408 |
| 1023 | 0.004549 | 0.001671 | 0.004547 | 0.002342 | 0.002594 | 0.005166 | 0.005302 | 0.010107 | 0.004819 | 0.004440 | ... | 0.001614 | 0.002826 | 0.003793 | 0.005512 | 0.004047 | 0.000091 | 0.003091 | 0.007054 | 0.002303 | 0.002244 |
| 1289 | 0.005463 | 0.005458 | 0.005007 | 0.008711 | 0.008324 | 0.008017 | 0.002515 | 0.000000 | 0.000701 | 0.003263 | ... | 0.012729 | 0.000000 | 0.001181 | 0.000000 | 0.000000 | 0.000000 | 0.000379 | 0.003718 | 0.001184 | 0.000000 |
| 1394 | 0.003727 | 0.000000 | 0.006181 | 0.007812 | 0.011454 | 0.007013 | 0.002900 | 0.008700 | 0.017799 | 0.000000 | ... | 0.001357 | 0.002714 | 0.001361 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.006441 | 0.006822 | 0.000000 |
| 1400 | 0.002926 | 0.004321 | 0.003929 | 0.007371 | 0.004933 | 0.006148 | 0.006626 | 0.004032 | 0.005091 | 0.006407 | ... | 0.000947 | 0.000515 | 0.000721 | 0.001353 | 0.001082 | 0.003307 | 0.005602 | 0.013043 | 0.001784 | 0.002195 |
| 1406 | 0.001054 | 0.002751 | 0.005927 | 0.008684 | 0.002118 | 0.001704 | 0.006936 | 0.003378 | 0.003141 | 0.003840 | ... | 0.000000 | 0.000000 | 0.002838 | 0.000000 | 0.004602 | 0.000000 | 0.003521 | 0.002610 | 0.000000 | 0.000000 |
| 1407 | 0.000000 | 0.003458 | 0.000000 | 0.003466 | 0.002137 | 0.002137 | 0.008025 | 0.002117 | 0.001669 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.006451 | 0.000000 | 0.004753 | 0.003928 | 0.003568 | 0.000000 |
| 1413 | 0.002205 | 0.000000 | 0.009781 | 0.002210 | 0.000000 | 0.004102 | 0.002195 | 0.000000 | 0.007826 | 0.004451 | ... | 0.000000 | 0.003133 | 0.003689 | 0.000000 | 0.003689 | 0.000000 | 0.003025 | 0.000000 | 0.000000 | 0.000000 |
| 1414 | 0.002077 | 0.001345 | 0.002859 | 0.002859 | 0.000000 | 0.008698 | 0.000000 | 0.002111 | 0.006391 | 0.006581 | ... | 0.001220 | 0.000000 | 0.030658 | 0.000000 | 0.002447 | 0.007096 | 0.002953 | 0.003415 | 0.000000 | 0.002453 |
| 1415 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.006526 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.019669 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.006977 | 0.000000 | 0.000000 |
| 1416 | 0.000000 | 0.001152 | 0.003464 | 0.003464 | 0.000000 | 0.001157 | 0.001147 | 0.000000 | 0.005720 | 0.002325 | ... | 0.000000 | 0.000000 | 0.000000 | 0.001927 | 0.016363 | 0.000000 | 0.001237 | 0.002474 | 0.005797 | 0.000000 |
| 1421 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.012617 | 0.002415 | 0.002393 | 0.012502 | 0.010086 | 0.004852 | ... | 0.000000 | 0.000000 | 0.000000 | 0.002010 | 0.035991 | 0.000000 | 0.000645 | 0.012844 | 0.000000 | 0.000000 |
| 1435 | 0.001261 | 0.000848 | 0.002496 | 0.002113 | 0.001916 | 0.000000 | 0.004557 | 0.000844 | 0.001526 | 0.003026 | ... | 0.005681 | 0.000000 | 0.027978 | 0.000000 | 0.000000 | 0.000000 | 0.003451 | 0.003101 | 0.000000 | 0.002240 |
| 1467 | 0.005943 | 0.001306 | 0.006087 | 0.004426 | 0.001110 | 0.008658 | 0.000000 | 0.000550 | 0.002395 | 0.009650 | ... | 0.002989 | 0.000922 | 0.000924 | 0.007275 | 0.006047 | 0.000000 | 0.002151 | 0.001960 | 0.000927 | 0.000000 |
| 2324 | 0.001191 | 0.004067 | 0.003903 | 0.002297 | 0.004299 | 0.006567 | 0.002526 | 0.006794 | 0.005242 | 0.013288 | ... | 0.001729 | 0.000000 | 0.011762 | 0.005427 | 0.002350 | 0.001733 | 0.007658 | 0.009119 | 0.001858 | 0.000000 |
| 19337 | 0.000790 | 0.007504 | 0.002705 | 0.006386 | 0.006561 | 0.007194 | 0.006789 | 0.002413 | 0.002633 | 0.003297 | ... | 0.001046 | 0.004044 | 0.000000 | 0.001693 | 0.007821 | 0.004055 | 0.003088 | 0.005422 | 0.002749 | 0.001052 |
| 20795 | 0.001823 | 0.005518 | 0.007296 | 0.002834 | 0.005553 | 0.001894 | 0.002724 | 0.000000 | 0.004632 | 0.002837 | ... | 0.000000 | 0.006062 | 0.004599 | 0.007857 | 0.003077 | 0.001550 | 0.003461 | 0.004971 | 0.003135 | 0.000000 |
| 27924 | 0.006111 | 0.002694 | 0.004254 | 0.004628 | 0.005332 | 0.002912 | 0.001697 | 0.002605 | 0.002842 | 0.003893 | ... | 0.006131 | 0.008097 | 0.000766 | 0.001454 | 0.005485 | 0.000000 | 0.002421 | 0.009927 | 0.004540 | 0.000753 |
| 35536 | 0.000000 | 0.001656 | 0.013753 | 0.000000 | 0.000000 | 0.000000 | 0.004330 | 0.002165 | 0.002978 | 0.000000 | ... | 0.000000 | 0.000000 | 0.002209 | 0.003639 | 0.000000 | 0.001586 | 0.003679 | 0.002642 | 0.002215 | 0.012037 |
50 rows × 1000 columns
# DOC Table
# LIB table when books are docs

# Start from an empty frame that has one row per book in mean_TFIDF.
book_DOC = pd.DataFrame(index=mean_TFIDF.index)
# Bring in author and title from LIB, matched on the shared index.
book_DOC = book_DOC.join(LIB[['author', 'title']])
# Human-readable label: "<author text before the first comma> <row label>: <title>".
# x.name is the row's index label, i.e. the book_id.
book_DOC['label'] = book_DOC.apply(
    lambda x: f"{x.author.split(',')[0]} {x.name}: {x.title}",
    axis=1,
)
book_DOC
| author | title | label | |
|---|---|---|---|
| book_id | |||
| 98 | dickens | a tale of two cities | dickens 98: a tale of two cities |
| 564 | dickens | the mystery of edwin drood | dickens 564: the mystery of edwin drood |
| 580 | dickens | the pickwick papers | dickens 580: the pickwick papers |
| 588 | dickens | master humphreys clock | dickens 588: master humphreys clock |
| 644 | dickens | the haunted man and the ghosts bargain | dickens 644: the haunted man and the ghosts ba... |
| 650 | dickens | pictures from italy | dickens 650: pictures from italy |
| 653 | dickens | the chimes | dickens 653: the chimes |
| 675 | dickens | american notes | dickens 675: american notes |
| 676 | dickens | the battle of life | dickens 676: the battle of life |
| 699 | dickens | a childs history of england | dickens 699: a childs history of england |
| 700 | dickens | the old curiosity shop | dickens 700: the old curiosity shop |
| 730 | dickens | oliver twist | dickens 730: oliver twist |
| 766 | dickens | david copperfield | dickens 766: david copperfield |
| 786 | dickens | hard times | dickens 786: hard times |
| 807 | dickens | hunted down | dickens 807: hunted down |
| 809 | dickens | holiday romance | dickens 809: holiday romance |
| 810 | dickens | george silvermans explanation | dickens 810: george silvermans explanation |
| 821 | dickens | dombey and sons | dickens 821: dombey and sons |
| 824 | dickens | speeches of charles dickens | dickens 824: speeches of charles dickens |
| 872 | dickens | reprinted pieces | dickens 872: reprinted pieces |
| 882 | dickens | sketches by boz | dickens 882: sketches by boz |
| 883 | dickens | our mutual friend | dickens 883: our mutual friend |
| 888 | dickens | the lazy tour of two idle apprentices | dickens 888: the lazy tour of two idle apprent... |
| 912 | dickens | the mudfog and other sketches | dickens 912: the mudfog and other sketches |
| 914 | dickens | the uncommerical traveller | dickens 914: the uncommerical traveller |
| 916 | dickens | sketches of young couples | dickens 916: sketches of young couples |
| 917 | dickens | barnaby rudge | dickens 917: barnaby rudge |
| 918 | dickens | sketches of young gentlemen | dickens 918: sketches of young gentlemen |
| 922 | dickens | sunday under three heads | dickens 922: sunday under three heads |
| 927 | dickens | the lamplighter | dickens 927: the lamplighter |
| 967 | dickens | nicholas nickleby | dickens 967: nicholas nickleby |
| 968 | dickens | martin chuzzlewit | dickens 968: martin chuzzlewit |
| 1023 | dickens | bleak house | dickens 1023: bleak house |
| 1289 | dickens | three ghost stories | dickens 1289: three ghost stories |
| 1394 | dickens | the holly tree | dickens 1394: the holly tree |
| 1400 | dickens | great expectations | dickens 1400: great expectations |
| 1406 | dickens | the perils of certain english prisoners | dickens 1406: the perils of certain english pr... |
| 1407 | dickens | a message from the sea | dickens 1407: a message from the sea |
| 1413 | dickens | tom tiddlers ground | dickens 1413: tom tiddlers ground |
| 1414 | dickens | somebodys luggage | dickens 1414: somebodys luggage |
| 1415 | dickens | doctor marigold | dickens 1415: doctor marigold |
| 1416 | dickens | mrs lirripers lodgings | dickens 1416: mrs lirripers lodgings |
| 1421 | dickens | mrs lirripers legacy | dickens 1421: mrs lirripers legacy |
| 1435 | dickens | miscellaneous papers | dickens 1435: miscellaneous papers |
| 1467 | dickens | some christmas stories | dickens 1467: some christmas stories |
| 2324 | dickens | a house to let | dickens 2324: a house to let |
| 19337 | dickens | a christmas carol | dickens 19337: a christmas carol |
| 20795 | dickens | the cricket on the hearth | dickens 20795: the cricket on the hearth |
| 27924 | dickens | mugby junction | dickens 27924: mugby junction |
| 35536 | dickens | the poems and verses of charles dickens | dickens 35536: the poems and verses of charles... |
# L0: presence/absence table, 1 wherever a book's mean TFIDF is nonzero.
L0 = mean_TFIDF_sigs.astype('bool').astype('int')


def _unit_sum(row):
    """Scale one book's row so that its values sum to 1 (L1 normalization)."""
    return row / row.sum()


def _unit_length(row):
    """Scale one book's row to Euclidean length 1 (L2 normalization)."""
    return row / norm(row)


# Both normalizations are applied row-wise (axis=1), i.e. per book.
L1 = mean_TFIDF_sigs.apply(_unit_sum, axis=1)
L2 = mean_TFIDF_sigs.apply(_unit_length, axis=1)

# Sanity checks: every L1 row sums to 1 and every L2 row has squared length 1,
# so each grand total should equal the number of books.
n_books = len(mean_TFIDF_sigs)
assert round(L1.sum(1).sum()) == n_books
assert round(((L2.T)**2).sum().sum()) == n_books
# Pairwise book table (PAIRS)
# Preview: correlation between every pair of books' mean-TFIDF rows,
# stacked into a Series indexed by (book_id, book_id).
mean_TFIDF_sigs.T.corr().stack()
book_id book_id
98 98 1.000000
564 0.182350
580 -0.002878
588 0.053118
644 0.321641
...
35536 2324 0.075841
19337 0.057664
20795 0.008652
27924 0.033643
35536 1.000000
Length: 2500, dtype: float64
# Correlation distance between books: 1 - correlation of the books' mean-TFIDF
# rows, stacked into one row per (book, book) pair.
# NOTE: the column is named 'corr_raw' but holds 1 - r, not r itself.
PAIRS = 1 - mean_TFIDF_sigs.T.corr().stack().to_frame('corr_raw')
# rename index levels
PAIRS.index.names = ['doc_a', 'doc_b']
# Remove identities (e.g., corr(105, 105)) and reverse duplicates
# (e.g., corr(105, 121) = corr(121, 105)).
# Keep doc_a < doc_b, i.e. the upper triangle in row-major order:
# (b0,b1), (b0,b2), ..., (b0,bn), (b1,b2), ...
# This is the order in which scipy's pdist emits its condensed distances, and
# the distance columns are later assigned to PAIRS positionally from pdist.
# Filtering on doc_a > doc_b yields the lower triangle, (b1,b0), (b2,b0),
# (b2,b1), ..., which attaches every pdist value to the wrong pair label.
# .copy() makes PAIRS an independent frame so adding columns later does not
# write into a filtered view.
PAIRS = PAIRS.query("doc_a < doc_b").copy()
PAIRS
| corr_raw | ||
|---|---|---|
| doc_a | doc_b | |
| 564 | 98 | 0.817650 |
| 580 | 98 | 1.002878 |
| 564 | 0.877140 | |
| 588 | 98 | 0.946882 |
| 564 | 0.787186 | |
| ... | ... | ... |
| 35536 | 1467 | 0.833318 |
| 2324 | 0.924159 | |
| 19337 | 0.942336 | |
| 20795 | 0.991348 | |
| 27924 | 0.966357 |
1225 rows × 1 columns
# pdist()
# Each entry: (feature table, scipy pdist metric name, PAIRS column label).
combos = [
    (mean_TFIDF_sigs, 'cityblock', 'cityblock–raw'),
    (mean_TFIDF_sigs, 'euclidean', 'euclidean–raw'),
    (L2, 'euclidean', 'euclidean–l2'),
    (mean_TFIDF_sigs, 'cosine', 'cosine–raw'),
    (L1, 'cityblock', 'cityblock–l1'),
    (L0, 'jaccard', 'jaccard–l0'),
    (L0, 'jensenshannon', 'js–l0'),
    (L1, 'jensenshannon', 'js–l1'),
    # NOTE(review): in the displayed output js–l2 equals js–l1 row for row,
    # consistent with the Jensen-Shannon metric renormalizing its inputs to
    # sum to 1 - confirm whether this column adds information.
    (L2, 'jensenshannon', 'js–l2'),
]
# pdist returns a condensed 1-D vector ordered d(0,1), d(0,2), ..., d(0,n-1),
# d(1,2), ... and the assignment below is positional, not index-aligned.
# NOTE(review): the rows of PAIRS must therefore be in that same
# upper-triangle, row-major order for the (doc_a, doc_b) labels to be correct.
for X, metric, label in combos:
    PAIRS[label] = pdist(X, metric)
PAIRS.head(20)
| corr_raw | cityblock–raw | euclidean–raw | euclidean–l2 | cosine–raw | cityblock–l1 | jaccard–l0 | js–l0 | js–l1 | js–l2 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| doc_a | doc_b | ||||||||||
| 564 | 98 | 0.817650 | 2.181992 | 0.102131 | 0.705128 | 0.248603 | 0.560497 | 0.017000 | 0.077078 | 0.261065 | 0.261065 |
| 580 | 98 | 1.002878 | 2.289080 | 0.110589 | 0.812632 | 0.330185 | 0.651664 | 0.006000 | 0.045650 | 0.296771 | 0.296771 |
| 564 | 0.877140 | 3.089993 | 0.164050 | 0.906774 | 0.411119 | 0.766355 | 0.047047 | 0.128958 | 0.344255 | 0.344255 | |
| 588 | 98 | 0.946882 | 2.587732 | 0.125556 | 0.770693 | 0.296984 | 0.689925 | 0.123246 | 0.211748 | 0.330926 | 0.330926 |
| 564 | 0.787186 | 2.589356 | 0.115496 | 0.878369 | 0.385766 | 0.828921 | 0.054162 | 0.138488 | 0.367902 | 0.367902 | |
| 580 | 0.721002 | 2.894603 | 0.139821 | 0.776074 | 0.301146 | 0.694771 | 0.134134 | 0.221436 | 0.332535 | 0.332535 | |
| 644 | 98 | 0.678359 | 2.233104 | 0.105889 | 0.811974 | 0.329651 | 0.668478 | 0.010010 | 0.059048 | 0.303062 | 0.303062 |
| 564 | 0.891082 | 3.092410 | 0.154481 | 0.832096 | 0.346192 | 0.731800 | 0.117117 | 0.206221 | 0.344001 | 0.344001 | |
| 580 | 0.985001 | 2.331377 | 0.104579 | 0.846610 | 0.358374 | 0.767106 | 0.036145 | 0.112723 | 0.344390 | 0.344390 | |
| 588 | 0.984815 | 2.082619 | 0.103715 | 0.739933 | 0.273750 | 0.561057 | 0.006000 | 0.045650 | 0.261069 | 0.261069 | |
| 650 | 98 | 0.871224 | 2.152545 | 0.108974 | 0.795297 | 0.316249 | 0.607677 | 0.007000 | 0.049329 | 0.282599 | 0.282599 |
| 564 | 0.975258 | 2.305293 | 0.103135 | 0.679688 | 0.230988 | 0.542875 | 0.006000 | 0.045650 | 0.249071 | 0.249071 | |
| 580 | 1.049085 | 2.750142 | 0.142709 | 0.770513 | 0.296845 | 0.626226 | 0.021021 | 0.085772 | 0.298195 | 0.298195 | |
| 588 | 0.995085 | 4.381819 | 0.222845 | 0.999091 | 0.499091 | 1.055990 | 0.451256 | 0.435729 | 0.494183 | 0.494183 | |
| 644 | 1.040313 | 3.624371 | 0.256947 | 1.177945 | 0.693777 | 1.093038 | 0.394975 | 0.401914 | 0.508821 | 0.508821 | |
| 653 | 98 | 0.723809 | 3.706198 | 0.211257 | 1.039313 | 0.540086 | 1.052495 | 0.409045 | 0.410432 | 0.487383 | 0.487383 |
| 564 | 0.910575 | 1.936816 | 0.089189 | 0.634497 | 0.201293 | 0.502645 | 0.006000 | 0.045650 | 0.233870 | 0.233870 | |
| 580 | 0.828846 | 3.064535 | 0.151449 | 1.008603 | 0.508640 | 0.922165 | 0.060241 | 0.146181 | 0.406321 | 0.406321 | |
| 588 | 0.875357 | 2.195757 | 0.106937 | 0.807463 | 0.325998 | 0.654300 | 0.014014 | 0.069931 | 0.295467 | 0.295467 | |
| 644 | 0.658470 | 2.283973 | 0.101496 | 0.808304 | 0.326678 | 0.703473 | 0.008000 | 0.052754 | 0.311239 | 0.311239 |
LIB['label'] = book_DOC['label']
def hca(sims, title="My Dendrogram", linkage_method='weighted', color_thresh=None, figsize=(15, 20)):
    """Plot a left-oriented dendrogram of a hierarchical clustering of ``sims``.

    Parameters
    ----------
    sims : array-like
        Passed straight to ``scipy.cluster.hierarchy.linkage``; the caller in
        this file passes one column of PAIRS (pairwise document distances).
    title : str
        Figure title.
    linkage_method : str
        Linkage method name forwarded to ``sch.linkage``.
    color_thresh : float, optional
        Distance below which sub-trees are coloured.  When omitted (``None``)
        the median merge distance of the tree is used.  An explicit ``0`` is
        honoured rather than being replaced by the median.
    figsize : tuple
        Size of the figure in inches.

    Returns
    -------
    None
        The function only draws; exactly one figure is created per call.
    """
    # calculate linkage using given method
    tree = sch.linkage(sims, method=linkage_method)
    # leaf labels come from the module-level LIB table
    # NOTE(review): assumes LIB row order matches the observations behind
    # `sims` -- confirm against how PAIRS was built.
    labels = LIB.label.values
    # default colour threshold: median of the merge distances, which are
    # stored in column 2 of the linkage matrix
    if color_thresh is None:
        color_thresh = pd.DataFrame(tree)[2].median()
    # A single figure/axes pair.  The previous extra plt.figure() call opened
    # an empty figure on every invocation, which is what triggered the
    # "More than 20 figures have been opened" warning.
    fig, axes = plt.subplots(figsize=figsize)
    # NOTE(review): scipy documents count_sort and distance_sort as mutually
    # exclusive -- confirm which ordering is actually intended here.
    sch.dendrogram(
        tree,
        labels=labels,
        orientation="left",
        count_sort=True,
        distance_sort=True,
        above_threshold_color='.75',
        color_threshold=color_thresh,
        ax=axes,
    )
    axes.tick_params(axis='both', which='major', labelsize=14)
    fig.suptitle(title, fontsize=20)
# Draw one dendrogram per (distance metric, linkage method) pair.
# The last element of each combo is the name of its column in PAIRS.
for combo, method in itertools.product(combos, ('ward', 'weighted')):
    metric = combo[-1]
    # title: distance metric - linkage method
    hca(PAIRS[metric], f"{metric}–{method}", linkage_method=method)
/var/folders/3n/4b11y5qn5cn20kztppfbsxq40000gn/T/ipykernel_24727/163819248.py:14: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
# POS tags counted as nouns here: singular and plural common nouns only,
# proper nouns (NNP / NNPS) are deliberately left out
noun_tags = ['NN', 'NNS']
# the 20 noun-dominant terms in SIGS with the highest dfidf
is_noun = SIGS.max_pos.isin(noun_tags)
SIGS[is_noun].sort_values('dfidf', ascending=False).head(20)
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| sleep | 594 | 797 | 5 | 0.000160 | 12.606370 | NN | 9 | {'NNP', 'VBD', 'VBG', 'VBP', 'IN', 'VB', 'NN',... | 0 | sleep | sleep | sleep | 538 | 473418 | 428786 | 0.012122 | 0.071449 | 434.0 | 1.445463 | 627.330980 |
| windows | 642 | 747 | 7 | 0.000150 | 12.699841 | NNS | 12 | {'WDT', 'VBD', 'PDT', 'VBP', 'IN', 'RB', 'VBZ'... | 0 | window | window | window | 571 | 479574 | 426537 | 0.010575 | 0.076252 | 433.0 | 1.448791 | 627.326549 |
| top | 638 | 752 | 3 | 0.000151 | 12.690217 | NN | 8 | {'NNP', 'VBD', 'VBP', 'IN', 'VB', 'VBZ', 'NN',... | 0 | top | top | top | 569 | 479776 | 427888 | 0.010093 | 0.056446 | 433.0 | 1.448791 | 627.326549 |
| dress | 659 | 730 | 5 | 0.000147 | 12.733053 | NN | 8 | {'VBP', 'IN', 'RB', 'VB', 'VBZ', 'NN', 'NNS', ... | 0 | dress | dress | dress | 580 | 481070 | 423400 | 0.011136 | 0.056295 | 437.0 | 1.435525 | 627.324360 |
| thank | 565 | 849 | 5 | 0.000171 | 12.515185 | NN | 7 | {'NNP', 'VBD', 'VBP', 'IN', 'VB', 'NN', 'JJ'} | 0 | thank | thank | thank | 515 | 479685 | 437235 | 0.014102 | 0.094442 | 437.0 | 1.435525 | 627.324360 |
| confidence | 633 | 757 | 10 | 0.000152 | 12.680656 | NN | 5 | {'NNP', 'VB', 'NNS', 'NN', 'JJ'} | 0 | confid | confid | confid | 565 | 479181 | 427705 | 0.012530 | 0.079385 | 439.0 | 1.428937 | 627.303427 |
| breast | 660 | 729 | 6 | 0.000147 | 12.735030 | NN | 9 | {'VBD', 'VBN', 'VBP', 'IN', 'RB', 'VBZ', 'VB',... | 0 | breast | breast | breast | 581 | 481140 | 423549 | 0.011534 | 0.054263 | 439.0 | 1.428937 | 627.303427 |
| notice | 728 | 644 | 6 | 0.000130 | 12.913889 | NN | 9 | {'NNP', 'VBN', 'VBP', 'IN', 'RB', 'VB', 'NNS',... | 0 | notic | notic | not | 631 | 468832 | 406364 | 0.009954 | 0.058353 | 430.0 | 1.458821 | 627.293232 |
| duty | 667 | 721 | 4 | 0.000145 | 12.750950 | NN | 9 | {'NNP', 'VBN', 'VBP', 'IN', 'VB', 'NNS', 'NN',... | 0 | duti | duti | duty | 587 | 480907 | 423227 | 0.012026 | 0.054560 | 427.0 | 1.468922 | 627.229720 |
| bless | 639 | 751 | 5 | 0.000151 | 12.692136 | NN | 12 | {'NNP', 'VBD', 'PDT', 'VBP', 'IN', 'RB', 'VB',... | 0 | bless | bless | bless | 570 | 479889 | 428070 | 0.011704 | 0.166571 | 443.0 | 1.415851 | 627.222184 |
| number | 576 | 837 | 6 | 0.000168 | 12.535722 | NN | 8 | {'NNP', 'CC', 'VBP', 'IN', 'VB', 'NNS', 'NN', ... | 0 | number | number | numb | 522 | 482112 | 436914 | 0.013129 | 0.133846 | 426.0 | 1.472305 | 627.201802 |
| stairs | 550 | 878 | 6 | 0.000177 | 12.466728 | NN | 10 | {'NNP', 'VBD', 'PDT', 'VBP', 'RB', 'VBZ', 'VB'... | 0 | stair | stair | stair | 501 | 482900 | 439878 | 0.013373 | 0.058443 | 425.0 | 1.475695 | 627.170498 |
| breath | 701 | 677 | 6 | 0.000136 | 12.841793 | NN | 7 | {'NNP', 'VBD', 'VBN', 'VB', 'NN', 'NNS', 'JJ'} | 0 | breath | breath | brea | 613 | 474577 | 415001 | 0.009623 | 0.058723 | 445.0 | 1.409353 | 627.161993 |
| order | 699 | 682 | 5 | 0.000137 | 12.831177 | NN | 12 | {'NNP', 'PRP', 'VBG', 'VBP', 'IN', 'RB', 'VB',... | 0 | order | order | ord | 611 | 476718 | 416702 | 0.009951 | 0.140935 | 445.0 | 1.409353 | 627.161993 |
| office | 472 | 1021 | 6 | 0.000205 | 12.249038 | NN | 11 | {'NNP', 'PRP', 'VBD', 'VBP', 'IN', 'VBZ', 'VB'... | 0 | offic | offic | off | 444 | 481912 | 453324 | 0.016978 | 0.149610 | 423.0 | 1.482500 | 627.097698 |
| ladies | 378 | 1336 | 6 | 0.000269 | 11.861101 | NNS | 12 | {'NNP', 'VBD', 'NNPS', 'VBN', 'VBP', 'RB', 'VB... | 0 | ladi | ladi | lady | 363 | 505008 | 484968 | 0.024417 | 0.317679 | 423.0 | 1.482500 | 627.097698 |
| paper | 530 | 913 | 5 | 0.000184 | 12.410334 | NN | 9 | {'NNP', 'VBP', 'IN', 'RB', 'VBZ', 'VB', 'NNS',... | 0 | paper | paper | pap | 485 | 483890 | 442805 | 0.012769 | 0.157123 | 447.0 | 1.402883 | 627.088835 |
| knowledge | 703 | 675 | 9 | 0.000136 | 12.846062 | NN | 9 | {'NNP', 'VBD', 'VBP', 'VB', 'VBZ', 'NNS', 'NN'... | 0 | knowledg | knowledg | knowledg | 614 | 474525 | 414450 | 0.011910 | 0.099061 | 422.0 | 1.485915 | 627.056185 |
| instant | 742 | 627 | 7 | 0.000126 | 12.952484 | NN | 6 | {'NNP', 'VBP', 'VBZ', 'VB', 'NN', 'JJ'} | 0 | instant | instant | inst | 638 | 465234 | 400026 | 0.010182 | 0.060240 | 422.0 | 1.485915 | 627.056185 |
| mouth | 684 | 701 | 5 | 0.000141 | 12.791535 | NN | 7 | {'NNP', 'VBD', 'VB', 'VBZ', 'NN', 'NNS', 'JJ'} | 0 | mouth | mouth | mou | 598 | 479484 | 419198 | 0.011229 | 0.059836 | 422.0 | 1.485915 | 627.056185 |
# term strings of the 20 highest-dfidf nouns in the full vocabulary
noun_vocab = VOCAB[VOCAB.max_pos.isin(noun_tags)]
top_20_nouns = noun_vocab.sort_values('dfidf', ascending=False).index[:20].tolist()
print(top_20_nouns)
['sleep', 'windows', 'top', 'thank', 'dress', 'confidence', 'breast', 'notice', 'duty', 'bless', 'number', 'stairs', 'breath', 'order', 'ladies', 'office', 'paper', 'knowledge', 'instant', 'mouth']
# per-book averages of the BOW columns, highest mean tfidf first, with the
# library metadata attached for readability
book_means = BOW.groupby(BOOKS).mean()
book_means.sort_values('tfidf', ascending=False).join(LIB, on='book_id')
| n | tf | tfidf | source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||||
| 35536 | 2.144928 | 0.037376 | 0.069868 | Dickens/35536-the_poems_and_verses_of_charles_... | the poems and verses of charles dickens | THE VILLAGE COQUETTES$|THE LAMPLIGHTER$|SONGS ... | dickens | stories | 1885 | 1880 | 13 | 10952 | dickens 35536: the poems and verses of charles... |
| 1415 | 2.952430 | 0.044903 | 0.053137 | Dickens/1415-doctor_marigold.txt | doctor marigold | \* \* \* \* \* | dickens | stories | 1865 | 1860 | 2 | 2855 | dickens 1415: doctor marigold |
| 810 | 2.757976 | 0.044831 | 0.048352 | Dickens/810-george_silvermans_explanation.txt | george silvermans explanation | [A-Z]+\sCHAPTER$ | dickens | stories | 1868 | 1860 | 9 | 11065 | dickens 810: george silvermans explanation |
| 918 | 2.391451 | 0.033139 | 0.042115 | Dickens/918-sketches_of_young_gentlemen.txt | sketches of young gentlemen | THE BASHFUL YOUNG GENTLEMAN$|THE OUT-AND-OUT Y... | dickens | stories | 1838 | 1830 | 13 | 17063 | dickens 918: sketches of young gentlemen |
| 916 | 2.559015 | 0.031080 | 0.041771 | Dickens/916-sketches_of_young_couples.txt | sketches of young couples | AN URGENT REMONSTRANCE, &C.$|THE YOUNG COUPLE$... | dickens | stories | 1840 | 1840 | 12 | 18082 | dickens 916: sketches of young couples |
| 824 | 2.618026 | 0.029218 | 0.039669 | Dickens/824-speeches_of_charles_dickens.txt | speeches of charles dickens | [IVXLCM]+\.$ | dickens | non-fiction | 1870 | 1870 | 58 | 87984 | dickens 824: speeches of charles dickens |
| 807 | 2.997925 | 0.039372 | 0.039216 | Dickens/807-hunted_down.txt | hunted down | ^[IVXLCM]+\.$ | dickens | stories | 1859 | 1850 | 5 | 8670 | dickens 807: hunted down |
| 1435 | 3.055027 | 0.018592 | 0.026247 | Dickens/1435-miscellaneous_papers.txt | miscellaneous papers | THE AGRICULTURAL INTEREST$|THREATENING LETTER ... | dickens | non-fiction | 1840 | 1840 | 9 | 23762 | dickens 1435: miscellaneous papers |
| 786 | 4.259530 | 0.021844 | 0.026233 | Dickens/786-hard_times.txt | hard times | CHAPTER\s[IVXLCM]+ | dickens | novel | 1854 | 1850 | 16 | 75760 | dickens 786: hard times |
| 1400 | 3.487522 | 0.024428 | 0.025954 | Dickens/1400-great_expectations.txt | great expectations | ^\s*Chapter\s*[IVXLCM]+ | dickens | novel | 1860 | 1860 | 59 | 185449 | dickens 1400: great expectations |
| 809 | 3.556357 | 0.017571 | 0.023731 | Dickens/809-holiday_romance.txt | holiday romance | ^PART\s[IVXLCM]+\.$ | dickens | stories | 1868 | 1860 | 4 | 13315 | dickens 809: holiday romance |
| 564 | 3.427017 | 0.018254 | 0.023636 | Dickens/564-the_mystery_of_edwin_drood.txt | the mystery of edwin drood | ^CHAPTER\s[IVXLCM]+\.$ | dickens | novel | 1870 | 1870 | 23 | 96378 | dickens 564: the mystery of edwin drood |
| 1413 | 3.486200 | 0.019202 | 0.023386 | Dickens/1413-tom_tiddlers_ground.txt | tom tiddlers ground | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1861 | 1860 | 3 | 9852 | dickens 1413: tom tiddlers ground |
| 917 | 3.090550 | 0.020404 | 0.022530 | Dickens/917-barnaby_rudge.txt | barnaby rudge | ^Chapter\s([0-9]+|the Last) | dickens | stories | 1841 | 1840 | 82 | 255400 | dickens 917: barnaby rudge |
| 98 | 3.358461 | 0.019807 | 0.021985 | Dickens/98-a_tale_of_two_cities.txt | a tale of two cities | ^\s*CHAPTER\s*[IVXLCM]+\.$ | dickens | novel | 1859 | 1850 | 45 | 137089 | dickens 98: a tale of two cities |
| 730 | 3.099458 | 0.017690 | 0.021117 | Dickens/730-oliver_twist.txt | oliver twist | ^\s*CHAPTER\s*[IVXLCM]+\.$ | dickens | novel | 1837 | 1830 | 53 | 158280 | dickens 730: oliver twist |
| 883 | 3.887494 | 0.017771 | 0.021075 | Dickens/883-our_mutual_friend.txt | our mutual friend | ^\s*Chapter\s* | dickens | novel | 1864 | 1860 | 67 | 328190 | dickens 883: our mutual friend |
| 700 | 3.094307 | 0.019364 | 0.021066 | Dickens/700-the_old_curiosity_shop.txt | the old curiosity shop | ^CHAPTER\s | dickens | novel | 1840 | 1840 | 73 | 218719 | dickens 700: the old curiosity shop |
| 1467 | 3.408788 | 0.018095 | 0.020612 | Dickens/1467-some_christmas_stories.txt | some christmas stories | A CHRISTMAS TREE[\.]?|WHAT CHRISTMAS IS AS WE ... | dickens | stories | 1850 | 1850 | 6 | 20947 | dickens 1467: some christmas stories |
| 872 | 3.231072 | 0.013806 | 0.019746 | Dickens/872-reprinted_pieces.txt | reprinted pieces | THE LONG VOYAGE$|THE BEGGING-LETTER WRITER$|A ... | dickens | stories | 1861 | 1860 | 23 | 91924 | dickens 872: reprinted pieces |
| 967 | 3.605761 | 0.016061 | 0.019325 | Dickens/967-nicholas_nickleby.txt | nicholas nickleby | ^(AUTHOR’S PREFACE|CHAPTER\s[0-9]+|Conclusion$) | dickens | novel | 1838 | 1830 | 66 | 326224 | dickens 967: nicholas nickleby |
| 882 | 3.025790 | 0.013125 | 0.019293 | Dickens/882-sketches_by_boz.txt | sketches by boz | ^(PREFACE|CHAPTER\s[IVXLCM]+) | dickens | stories | 1836 | 1830 | 57 | 184201 | dickens 882: sketches by boz |
| 2324 | 4.272909 | 0.015482 | 0.019035 | Dickens/2324-a_house_to_let.txt | a house to let | OVER THE WAY$|THE MANCHESTER MARRIAGE$|GOING I... | dickens | stories | 1858 | 1850 | 6 | 34132 | dickens 2324: a house to let |
| 1394 | 3.814458 | 0.016505 | 0.018806 | Dickens/1394-the_holly_tree.txt | the holly tree | ^[A-Z]+\sBRANCH | dickens | stories | 1855 | 1850 | 3 | 13877 | dickens 1394: the holly tree |
| 1023 | 4.023658 | 0.016917 | 0.018651 | Dickens/1023-bleak_house.txt | bleak house | ^\s*(PREFACE|CHAPTER\s*[IVXLCM]+)$ | dickens | novel | 1852 | 1850 | 68 | 357325 | dickens 1023: bleak house |
| 766 | 4.164960 | 0.018244 | 0.018586 | Dickens/766-david_copperfield.txt | david copperfield | \s*(PREFACE\sTO|CHAPTER\s*[0-9]*) | dickens | novel | 1849 | 1840 | 66 | 358375 | dickens 766: david copperfield |
| 914 | 3.062025 | 0.012692 | 0.018262 | Dickens/914-the_uncommerical_traveller.txt | the uncommerical traveller | ^[IVXLCM]+$ | dickens | non-fiction | 1860 | 1860 | 37 | 144157 | dickens 914: the uncommerical traveller |
| 1421 | 4.763350 | 0.015414 | 0.017159 | Dickens/1421-mrs_lirripers_legacy.txt | mrs lirripers legacy | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1864 | 1860 | 2 | 12399 | dickens 1421: mrs lirripers legacy |
| 1414 | 3.561426 | 0.012625 | 0.017034 | Dickens/1414-somebodys_luggage.txt | somebodys luggage | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1862 | 1860 | 4 | 19684 | dickens 1414: somebodys luggage |
| 821 | 3.849948 | 0.014126 | 0.016953 | Dickens/821-dombey_and_sons.txt | dombey and sons | ^\s*CHAPTER\s*[IVXLCM]+\.$ | dickens | novel | 1846 | 1840 | 62 | 356382 | dickens 821: dombey and sons |
| 699 | 3.808958 | 0.012132 | 0.016704 | Dickens/699-a_childs_history_of_england.txt | a childs history of england | ^CHAPTER\s[IVXLCM]+$ | dickens | non-fiction | 1853 | 1850 | 37 | 163271 | dickens 699: a childs history of england |
| 1407 | 3.729649 | 0.014084 | 0.016403 | Dickens/1407-a_message_from_the_sea.txt | a message from the sea | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1860 | 1860 | 3 | 12416 | dickens 1407: a message from the sea |
| 912 | 3.246902 | 0.010833 | 0.016324 | Dickens/912-the_mudfog_and_other_sketches.txt | the mudfog and other sketches | PUBLIC LIFE OF MR. TULRUMBLE$|FULL REPORT OF T... | dickens | stories | 1837 | 1830 | 7 | 30917 | dickens 912: the mudfog and other sketches |
| 968 | 3.960520 | 0.014513 | 0.016247 | Dickens/968-martin_chuzzlewit.txt | martin chuzzlewit | ^(PREFACE|CHAPTER\s[A-Z]+[-]?[A-Z]+$) | dickens | novel | 1842 | 1840 | 55 | 340276 | dickens 968: martin chuzzlewit |
| 580 | 3.630811 | 0.011790 | 0.015913 | Dickens/580-the_pickwick_papers.txt | the pickwick papers | ^CHAPTER\s[IVXLCM]+\.\s[A-Z]+ | dickens | novel | 1836 | 1830 | 57 | 302570 | dickens 580: the pickwick papers |
| 922 | 2.866613 | 0.010350 | 0.015044 | Dickens/922-sunday_under_three_heads.txt | sunday under three heads | ^[IVXLCM]+$ | dickens | non-fiction | 1836 | 1830 | 3 | 10767 | dickens 922: sunday under three heads |
| 588 | 3.900265 | 0.011998 | 0.014644 | Dickens/588-master_humphreys_clock.txt | master humphreys clock | ^(?:[IVXLCM]+$|TO THE READERS OF) | dickens | stories | 1840 | 1840 | 7 | 47084 | dickens 588: master humphreys clock |
| 19337 | 3.903588 | 0.012571 | 0.014323 | Dickens/19337-a_christmas_carol.txt | a christmas carol | ^\s*STAVE\s[A-Z]+$ | dickens | novel | 1843 | 1840 | 5 | 28828 | dickens 19337: a christmas carol |
| 1416 | 5.168994 | 0.012984 | 0.014209 | Dickens/1416-mrs_lirripers_lodgings.txt | mrs lirripers lodgings | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1863 | 1860 | 2 | 14437 | dickens 1416: mrs lirripers lodgings |
| 653 | 4.204897 | 0.011646 | 0.013845 | Dickens/653-the_chimes.txt | the chimes | ^CHAPTER\s[IVXLCM]+ | dickens | novel | 1844 | 1840 | 4 | 30742 | dickens 653: the chimes |
| 27924 | 4.389395 | 0.012060 | 0.013294 | Dickens/27924-mugby_junction.txt | mugby junction | BARBOX BROTHERS$|BARBOX BROTHERS AND CO\.$|MAI... | dickens | stories | 1866 | 1860 | 7 | 50083 | dickens 27924: mugby junction |
| 927 | 4.405577 | 0.011503 | 0.013199 | Dickens/927-the_lamplighter.txt | the lamplighter | ^‘IF | dickens | stories | 1838 | 1830 | 1 | 6952 | dickens 927: the lamplighter |
| 675 | 3.382945 | 0.009514 | 0.012563 | Dickens/675-american_notes.txt | american notes | ^CHAPTER\s[IVXLCM]+$ | dickens | non-fiction | 1842 | 1840 | 18 | 103305 | dickens 675: american notes |
| 676 | 4.515977 | 0.010164 | 0.012512 | Dickens/676-the_battle_of_life.txt | the battle of life | ^Part the [A-Z][a-z]+$ | dickens | novel | 1846 | 1840 | 3 | 29679 | dickens 676: the battle of life |
| 650 | 3.629300 | 0.007702 | 0.011398 | Dickens/650-pictures_from_italy.txt | pictures from italy | THE READER’S PASSPORT|GOING THROUGH FRANCE|LYO... | dickens | non-fiction | 1846 | 1840 | 11 | 73007 | dickens 650: pictures from italy |
| 1406 | 5.571753 | 0.009851 | 0.010948 | Dickens/1406-the_perils_of_certain_english_pri... | the perils of certain english prisoners | ^CHAPTER\s[IVXLCM]+ | dickens | stories | 1857 | 1850 | 2 | 19646 | dickens 1406: the perils of certain english pr... |
| 20795 | 4.899202 | 0.009398 | 0.010577 | Dickens/20795-the_cricket_on_the_hearth.txt | the cricket on the hearth | ^CHIRP\sTHE | dickens | novel | 1845 | 1840 | 3 | 31933 | dickens 20795: the cricket on the hearth |
| 1289 | 4.083012 | 0.009729 | 0.010370 | Dickens/1289-three_ghost_stories.txt | three ghost stories | THE HAUNTED HOUSE\.|THE TRIAL FOR MURDER\.|THE... | dickens | stories | 1860 | 1860 | 3 | 21150 | dickens 1289: three ghost stories |
| 644 | 4.975638 | 0.008870 | 0.009827 | Dickens/644-the_haunted_man_and_the_ghosts_bar... | the haunted man and the ghosts bargain | ^CHAPTER\s[IVXLCM]+$ | dickens | stories | 1848 | 1840 | 3 | 33904 | dickens 644: the haunted man and the ghosts ba... |
| 888 | 4.098543 | 0.007341 | 0.009703 | Dickens/888-the_lazy_tour_of_two_idle_apprenti... | the lazy tour of two idle apprentices | CHAPTER\s[IVXLCM]+$ | dickens | stories | 1857 | 1850 | 5 | 40510 | dickens 888: the lazy tour of two idle apprent... |
# Merge PAIRS with LIB twice to attach a label (author, book_id, title) for
# doc_a and for doc_b.
# Both merges are LEFT joins: every row of PAIRS is kept exactly once and in
# its original order.  The earlier right join on doc_b re-ordered the rows by
# LIB, added all-NaN rows for books that never occur as doc_b, and thereby
# upcast the doc_a / doc_b ids to float.
DISTS = pd.merge(PAIRS.reset_index(), LIB['label'], left_on='doc_a', right_on='book_id', how='left')
DISTS = pd.merge(DISTS, LIB['label'], left_on='doc_b', right_on='book_id', how='left')
# the two 'label' columns collide, so pandas suffixes them _x (doc_a) and _y (doc_b)
DISTS = DISTS.set_index(['doc_a', 'doc_b']).rename({'label_x': 'label_a', 'label_y': 'label_b'}, axis=1)
# reorder df columns so that label_a and label_b come first
DISTS.insert(loc=0, column='label_a', value=DISTS.pop('label_a'))
DISTS.insert(loc=1, column='label_b', value=DISTS.pop('label_b'))
DISTS.head(20).style.background_gradient(cmap='YlGnBu', high=.5, axis=0)
| label_a | label_b | corr_raw | cityblock–raw | euclidean–raw | euclidean–l2 | cosine–raw | cityblock–l1 | jaccard–l0 | js–l0 | js–l1 | js–l2 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| doc_a | doc_b | ||||||||||||
| 564.0 | 98.0 | dickens 564: the mystery of edwin drood | dickens 98: a tale of two cities | 0.817650 | 2.181992 | 0.102131 | 0.705128 | 0.248603 | 0.560497 | 0.017000 | 0.077078 | 0.261065 | 0.261065 |
| 580.0 | 98.0 | dickens 580: the pickwick papers | dickens 98: a tale of two cities | 1.002878 | 2.289080 | 0.110589 | 0.812632 | 0.330185 | 0.651664 | 0.006000 | 0.045650 | 0.296771 | 0.296771 |
| 588.0 | 98.0 | dickens 588: master humphreys clock | dickens 98: a tale of two cities | 0.946882 | 2.587732 | 0.125556 | 0.770693 | 0.296984 | 0.689925 | 0.123246 | 0.211748 | 0.330926 | 0.330926 |
| 644.0 | 98.0 | dickens 644: the haunted man and the ghosts bargain | dickens 98: a tale of two cities | 0.678359 | 2.233104 | 0.105889 | 0.811974 | 0.329651 | 0.668478 | 0.010010 | 0.059048 | 0.303062 | 0.303062 |
| 650.0 | 98.0 | dickens 650: pictures from italy | dickens 98: a tale of two cities | 0.871224 | 2.152545 | 0.108974 | 0.795297 | 0.316249 | 0.607677 | 0.007000 | 0.049329 | 0.282599 | 0.282599 |
| 653.0 | 98.0 | dickens 653: the chimes | dickens 98: a tale of two cities | 0.723809 | 3.706198 | 0.211257 | 1.039313 | 0.540086 | 1.052495 | 0.409045 | 0.410432 | 0.487383 | 0.487383 |
| 675.0 | 98.0 | dickens 675: american notes | dickens 98: a tale of two cities | 0.955682 | 2.137805 | 0.098926 | 0.789208 | 0.311424 | 0.679537 | 0.089089 | 0.178885 | 0.318723 | 0.318723 |
| 676.0 | 98.0 | dickens 676: the battle of life | dickens 98: a tale of two cities | 0.820688 | 4.551075 | 0.340992 | 1.198773 | 0.718528 | 1.208867 | 0.536072 | 0.486412 | 0.557537 | 0.557537 |
| 699.0 | 98.0 | dickens 699: a childs history of england | dickens 98: a tale of two cities | 0.769068 | 3.010372 | 0.147643 | 0.945397 | 0.446888 | 0.905983 | 0.348000 | 0.373570 | 0.443916 | 0.443916 |
| 700.0 | 98.0 | dickens 700: the old curiosity shop | dickens 98: a tale of two cities | 0.902024 | 2.798888 | 0.154623 | 0.932323 | 0.434613 | 0.778788 | 0.148297 | 0.233428 | 0.368973 | 0.368973 |
| 730.0 | 98.0 | dickens 730: oliver twist | dickens 98: a tale of two cities | 0.968183 | 3.252808 | 0.153514 | 0.819351 | 0.335668 | 0.742794 | 0.120240 | 0.209290 | 0.346299 | 0.346299 |
| 766.0 | 98.0 | dickens 766: david copperfield | dickens 98: a tale of two cities | 0.873235 | 2.471151 | 0.110196 | 0.762618 | 0.290793 | 0.643111 | 0.019019 | 0.081576 | 0.293994 | 0.293994 |
| 786.0 | 98.0 | dickens 786: hard times | dickens 98: a tale of two cities | 0.726354 | 2.067851 | 0.095040 | 0.624151 | 0.194782 | 0.489195 | 0.011000 | 0.061867 | 0.234257 | 0.234257 |
| 807.0 | 98.0 | dickens 807: hunted down | dickens 98: a tale of two cities | 0.886397 | 3.223907 | 0.144618 | 0.872691 | 0.380794 | 0.845872 | 0.197990 | 0.272609 | 0.394755 | 0.394755 |
| 809.0 | 98.0 | dickens 809: holiday romance | dickens 98: a tale of two cities | 1.017582 | 1.588135 | 0.085116 | 0.607068 | 0.184266 | 0.439615 | 0.001000 | 0.018620 | 0.205895 | 0.205895 |
| 810.0 | 98.0 | dickens 810: george silvermans explanation | dickens 98: a tale of two cities | 0.867269 | 1.997594 | 0.095671 | 0.666314 | 0.221987 | 0.497489 | 0.000000 | 0.000000 | 0.231578 | 0.231578 |
| 821.0 | 98.0 | dickens 821: dombey and sons | dickens 98: a tale of two cities | 0.754086 | 4.478313 | 0.333909 | 1.141066 | 0.651016 | 1.177295 | 0.395000 | 0.401810 | 0.526053 | 0.526053 |
| 824.0 | 98.0 | dickens 824: speeches of charles dickens | dickens 98: a tale of two cities | 1.046565 | 3.688435 | 0.195294 | 0.969963 | 0.470414 | 0.821025 | 0.052209 | 0.136182 | 0.373178 | 0.373178 |
| 872.0 | 98.0 | dickens 872: reprinted pieces | dickens 98: a tale of two cities | 0.898753 | 2.890082 | 0.156784 | 0.846636 | 0.358396 | 0.658543 | 0.043000 | 0.123040 | 0.307073 | 0.307073 |
| 882.0 | 98.0 | dickens 882: sketches by boz | dickens 98: a tale of two cities | 0.965408 | 3.453197 | 0.165532 | 1.049204 | 0.550414 | 1.013267 | 0.160282 | 0.245258 | 0.454803 | 0.454803 |
# z-score every distance column so metrics on different scales are comparable
col_means = PAIRS.mean()
col_stds = PAIRS.std()
ZPAIRS = PAIRS.sub(col_means, axis='columns').div(col_stds, axis='columns')
ZPAIRS
| corr_raw | cityblock–raw | euclidean–raw | euclidean–l2 | cosine–raw | cityblock–l1 | jaccard–l0 | js–l0 | js–l1 | js–l2 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| doc_a | doc_b | ||||||||||
| 564 | 98 | -0.793377 | -1.379250 | -1.117107 | -1.680734 | -1.584085 | -1.578464 | -1.319745 | -1.474008 | -1.626327 | -1.626327 |
| 580 | 98 | 0.814111 | -1.272129 | -1.018769 | -1.038081 | -1.070754 | -1.220194 | -1.375355 | -1.674885 | -1.304902 | -1.304902 |
| 564 | -0.277098 | -0.470971 | -0.397164 | -0.475308 | -0.561506 | -0.769479 | -1.167842 | -1.142406 | -0.877457 | -0.877457 | |
| 588 | 98 | 0.328151 | -0.973385 | -0.844751 | -1.288790 | -1.279663 | -1.069836 | -0.782614 | -0.613239 | -0.997442 | -0.997442 |
| 564 | -1.057759 | -0.971761 | -0.961709 | -0.645112 | -0.721034 | -0.523605 | -1.131869 | -1.081498 | -0.664586 | -0.664586 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 1467 | -0.657405 | -0.504204 | -0.349307 | -0.039757 | -0.129099 | -0.546136 | -0.569091 | -0.375189 | -0.441440 | -0.441440 |
| 2324 | 0.130957 | 0.506804 | 0.090930 | 0.862431 | 0.872827 | 1.082217 | 0.934007 | 0.907707 | 0.998858 | 0.998858 | |
| 19337 | 0.288699 | -0.386278 | -0.352837 | -0.288472 | -0.380110 | -0.681064 | -0.782686 | -0.604878 | -0.664081 | -0.664081 | |
| 20795 | 0.714052 | 0.752296 | 0.177578 | 0.899015 | 0.916479 | 1.025788 | 0.973376 | 0.927963 | 0.984284 | 0.984284 | |
| 27924 | 0.497169 | 0.272635 | -0.168241 | 0.616982 | 0.586053 | 0.840609 | 0.850490 | 0.810887 | 0.781297 | 0.781297 |
1225 rows × 10 columns
# box plot of the standardized distances, columns in alphabetical order
zpairs_by_name = ZPAIRS.sort_index(axis=1)
zpairs_by_name.plot.box(rot=45, figsize=(15, 7));
# scatter matrix of every metric against every other metric
sns.pairplot(ZPAIRS);
n_clusters = 4
# one KMeans model, re-fitted on each representation with the same seed
km = KMeans(n_clusters, random_state=314)
# cluster index per book for the raw vectors and the three normalized variants
for suffix, vectors in (('raw', mean_TFIDF_sigs), ('L0', L0), ('L1', L1), ('L2', L2)):
    book_DOC['y_' + suffix] = km.fit_predict(vectors)
# show everything except the first column, ordered by label
book_DOC.iloc[:, 1:].sort_values('label').style.background_gradient(cmap='RdBu')
| title | label | y_raw | y_L0 | y_L1 | y_L2 | |
|---|---|---|---|---|---|---|
| book_id | ||||||
| 1023 | bleak house | dickens 1023: bleak house | 3 | 0 | 0 | 3 |
| 1289 | three ghost stories | dickens 1289: three ghost stories | 3 | 0 | 0 | 0 |
| 1394 | the holly tree | dickens 1394: the holly tree | 3 | 2 | 0 | 3 |
| 1400 | great expectations | dickens 1400: great expectations | 3 | 0 | 0 | 3 |
| 1406 | the perils of certain english prisoners | dickens 1406: the perils of certain english prisoners | 3 | 2 | 0 | 3 |
| 1407 | a message from the sea | dickens 1407: a message from the sea | 3 | 2 | 0 | 0 |
| 1413 | tom tiddlers ground | dickens 1413: tom tiddlers ground | 3 | 2 | 0 | 3 |
| 1414 | somebodys luggage | dickens 1414: somebodys luggage | 3 | 2 | 0 | 0 |
| 1415 | doctor marigold | dickens 1415: doctor marigold | 0 | 3 | 2 | 1 |
| 1416 | mrs lirripers lodgings | dickens 1416: mrs lirripers lodgings | 1 | 2 | 3 | 2 |
| 1421 | mrs lirripers legacy | dickens 1421: mrs lirripers legacy | 1 | 2 | 3 | 2 |
| 1435 | miscellaneous papers | dickens 1435: miscellaneous papers | 3 | 0 | 0 | 0 |
| 1467 | some christmas stories | dickens 1467: some christmas stories | 3 | 0 | 0 | 0 |
| 19337 | a christmas carol | dickens 19337: a christmas carol | 3 | 0 | 0 | 3 |
| 20795 | the cricket on the hearth | dickens 20795: the cricket on the hearth | 3 | 0 | 0 | 3 |
| 2324 | a house to let | dickens 2324: a house to let | 3 | 0 | 0 | 3 |
| 27924 | mugby junction | dickens 27924: mugby junction | 3 | 0 | 0 | 0 |
| 35536 | the poems and verses of charles dickens | dickens 35536: the poems and verses of charles dickens | 3 | 1 | 0 | 0 |
| 564 | the mystery of edwin drood | dickens 564: the mystery of edwin drood | 3 | 0 | 0 | 3 |
| 580 | the pickwick papers | dickens 580: the pickwick papers | 3 | 0 | 0 | 3 |
| 588 | master humphreys clock | dickens 588: master humphreys clock | 3 | 0 | 0 | 3 |
| 644 | the haunted man and the ghosts bargain | dickens 644: the haunted man and the ghosts bargain | 3 | 0 | 0 | 3 |
| 650 | pictures from italy | dickens 650: pictures from italy | 3 | 0 | 0 | 0 |
| 653 | the chimes | dickens 653: the chimes | 3 | 0 | 0 | 3 |
| 675 | american notes | dickens 675: american notes | 3 | 0 | 0 | 0 |
| 676 | the battle of life | dickens 676: the battle of life | 3 | 0 | 0 | 3 |
| 699 | a childs history of england | dickens 699: a childs history of england | 3 | 0 | 0 | 0 |
| 700 | the old curiosity shop | dickens 700: the old curiosity shop | 3 | 0 | 0 | 3 |
| 730 | oliver twist | dickens 730: oliver twist | 3 | 0 | 0 | 3 |
| 766 | david copperfield | dickens 766: david copperfield | 3 | 0 | 0 | 3 |
| 786 | hard times | dickens 786: hard times | 3 | 0 | 0 | 3 |
| 807 | hunted down | dickens 807: hunted down | 3 | 2 | 0 | 3 |
| 809 | holiday romance | dickens 809: holiday romance | 3 | 2 | 1 | 0 |
| 810 | george silvermans explanation | dickens 810: george silvermans explanation | 3 | 2 | 0 | 3 |
| 821 | dombey and sons | dickens 821: dombey and sons | 3 | 0 | 0 | 3 |
| 824 | speeches of charles dickens | dickens 824: speeches of charles dickens | 3 | 0 | 0 | 0 |
| 872 | reprinted pieces | dickens 872: reprinted pieces | 3 | 0 | 0 | 0 |
| 882 | sketches by boz | dickens 882: sketches by boz | 3 | 0 | 0 | 0 |
| 883 | our mutual friend | dickens 883: our mutual friend | 3 | 0 | 0 | 3 |
| 888 | the lazy tour of two idle apprentices | dickens 888: the lazy tour of two idle apprentices | 3 | 0 | 0 | 0 |
| 912 | the mudfog and other sketches | dickens 912: the mudfog and other sketches | 3 | 0 | 0 | 0 |
| 914 | the uncommerical traveller | dickens 914: the uncommerical traveller | 3 | 0 | 0 | 0 |
| 916 | sketches of young couples | dickens 916: sketches of young couples | 2 | 0 | 0 | 3 |
| 917 | barnaby rudge | dickens 917: barnaby rudge | 3 | 0 | 0 | 3 |
| 918 | sketches of young gentlemen | dickens 918: sketches of young gentlemen | 3 | 0 | 0 | 3 |
| 922 | sunday under three heads | dickens 922: sunday under three heads | 3 | 2 | 0 | 0 |
| 927 | the lamplighter | dickens 927: the lamplighter | 1 | 3 | 3 | 2 |
| 967 | nicholas nickleby | dickens 967: nicholas nickleby | 3 | 0 | 0 | 3 |
| 968 | martin chuzzlewit | dickens 968: martin chuzzlewit | 3 | 0 | 0 | 3 |
| 98 | a tale of two cities | dickens 98: a tale of two cities | 3 | 0 | 0 | 3 |
# k values (number of clusters) to test
k_vals = list(range(2, 11))
# different feature vectors to use.  Keys must not contain '_': the cell that
# picks the best run recovers the key with max_score_vec.split('_')[0].
feature_vectors = {'raw': mean_TFIDF_sigs,
                   'L0': L0,
                   'L1': L1,
                   'L2': L2}
# one score column per feature vector, derived from the dict so the column
# names cannot drift away from its keys
score_cols = [f'{name}_silhouette_score' for name in feature_vectors]
# Collect one row per k and build the frame once at the end.  Growing an empty
# DataFrame with .loc[len(df)] re-allocates on every iteration and coerced the
# integer k to float.
rows = []
for k in k_vals:
    km = KMeans(k, random_state=314)
    scores = [silhouette_score(vec, km.fit_predict(vec)) for vec in feature_vectors.values()]
    rows.append([k, *scores])
km_results = pd.DataFrame(rows, columns=['k', *score_cols])
km_results.style.background_gradient(cmap='RdBu', axis=None, subset=km_results.columns[1:])
| k | raw_silhouette_score | L0_silhouette_score | L1_silhouette_score | L2_silhouette_score | |
|---|---|---|---|---|---|
| 0 | 2.000000 | 0.434304 | 0.375481 | 0.416088 | 0.153934 |
| 1 | 3.000000 | 0.447782 | 0.350379 | 0.424456 | 0.062581 |
| 2 | 4.000000 | 0.316780 | 0.355775 | 0.405744 | 0.063818 |
| 3 | 5.000000 | 0.313854 | 0.356422 | 0.046932 | 0.076669 |
| 4 | 6.000000 | 0.045889 | 0.352762 | 0.247122 | 0.070491 |
| 5 | 7.000000 | 0.049817 | 0.355153 | 0.252137 | 0.068152 |
| 6 | 8.000000 | 0.005861 | 0.317736 | 0.066262 | 0.046934 |
| 7 | 9.000000 | 0.048023 | 0.342503 | 0.253976 | 0.052131 |
| 8 | 10.000000 | 0.065307 | 0.298556 | 0.082754 | 0.066405 |
# Locate the best (k, feature vector) combination directly instead of masking
# the whole frame with `== max` and testing truthiness with .any(): that
# approach also compared the k column against a score and found no row at all
# when the best score was exactly 0.0.
scores_by_k = km_results.set_index('k')
# best score reached for each k across all feature vectors
best_per_k = scores_by_k.max(axis=1)
# overall highest silhouette score
max_silhouette_score = best_per_k.max()
# k value (num of clusters) corresponding to the highest silhouette score
# (first such k on ties, as before)
max_score_cluster = best_per_k.idxmax()
# feature vector column corresponding to the highest silhouette score
max_score_vec = scores_by_k.loc[max_score_cluster].idxmax()
max_score_vec
'raw_silhouette_score'
# key into feature_vectors: the part of the winning column name before the first '_'
best_vec_key = max_score_vec.split('_')[0]
max_col_name = 'max_y_{}'.format(best_vec_key)
# re-fit with the best k on the best feature vector and store the cluster ids
km = KMeans(int(max_score_cluster), random_state=314)
book_DOC[max_col_name] = km.fit_predict(feature_vectors[best_vec_key])
# bring in the document type so clusters can be broken down by it
book_DOC = book_DOC.join(LIB['type'])
cluster_view = book_DOC[['label', 'type', max_col_name]]
cluster_view.sort_values(max_col_name).style.background_gradient(cmap='RdBu')
| label | type | max_y_raw | |
|---|---|---|---|
| book_id | |||
| 927 | dickens 927: the lamplighter | stories | 0 |
| 1421 | dickens 1421: mrs lirripers legacy | stories | 0 |
| 1416 | dickens 1416: mrs lirripers lodgings | stories | 0 |
| 98 | dickens 98: a tale of two cities | novel | 1 |
| 917 | dickens 917: barnaby rudge | stories | 1 |
| 918 | dickens 918: sketches of young gentlemen | stories | 1 |
| 922 | dickens 922: sunday under three heads | non-fiction | 1 |
| 967 | dickens 967: nicholas nickleby | novel | 1 |
| 968 | dickens 968: martin chuzzlewit | novel | 1 |
| 1023 | dickens 1023: bleak house | novel | 1 |
| 1289 | dickens 1289: three ghost stories | stories | 1 |
| 1394 | dickens 1394: the holly tree | stories | 1 |
| 1406 | dickens 1406: the perils of certain english prisoners | stories | 1 |
| 916 | dickens 916: sketches of young couples | stories | 1 |
| 1407 | dickens 1407: a message from the sea | stories | 1 |
| 1413 | dickens 1413: tom tiddlers ground | stories | 1 |
| 1414 | dickens 1414: somebodys luggage | stories | 1 |
| 1435 | dickens 1435: miscellaneous papers | non-fiction | 1 |
| 1467 | dickens 1467: some christmas stories | stories | 1 |
| 2324 | dickens 2324: a house to let | stories | 1 |
| 19337 | dickens 19337: a christmas carol | novel | 1 |
| 20795 | dickens 20795: the cricket on the hearth | novel | 1 |
| 1400 | dickens 1400: great expectations | novel | 1 |
| 27924 | dickens 27924: mugby junction | stories | 1 |
| 914 | dickens 914: the uncommerical traveller | non-fiction | 1 |
| 888 | dickens 888: the lazy tour of two idle apprentices | stories | 1 |
| 564 | dickens 564: the mystery of edwin drood | novel | 1 |
| 580 | dickens 580: the pickwick papers | novel | 1 |
| 588 | dickens 588: master humphreys clock | stories | 1 |
| 644 | dickens 644: the haunted man and the ghosts bargain | stories | 1 |
| 650 | dickens 650: pictures from italy | non-fiction | 1 |
| 653 | dickens 653: the chimes | novel | 1 |
| 675 | dickens 675: american notes | non-fiction | 1 |
| 676 | dickens 676: the battle of life | novel | 1 |
| 699 | dickens 699: a childs history of england | non-fiction | 1 |
| 912 | dickens 912: the mudfog and other sketches | stories | 1 |
| 700 | dickens 700: the old curiosity shop | novel | 1 |
| 766 | dickens 766: david copperfield | novel | 1 |
| 786 | dickens 786: hard times | novel | 1 |
| 807 | dickens 807: hunted down | stories | 1 |
| 809 | dickens 809: holiday romance | stories | 1 |
| 810 | dickens 810: george silvermans explanation | stories | 1 |
| 821 | dickens 821: dombey and sons | novel | 1 |
| 824 | dickens 824: speeches of charles dickens | non-fiction | 1 |
| 872 | dickens 872: reprinted pieces | stories | 1 |
| 882 | dickens 882: sketches by boz | stories | 1 |
| 883 | dickens 883: our mutual friend | novel | 1 |
| 730 | dickens 730: oliver twist | novel | 1 |
| 35536 | dickens 35536: the poems and verses of charles dickens | stories | 1 |
| 1415 | dickens 1415: doctor marigold | stories | 2 |
# number of books assigned to each cluster of the best-scoring k-means run
book_DOC.groupby(max_col_name).size()
max_y_raw 0 3 1 46 2 1 dtype: int64
# how many books of each type land in each cluster
type_and_cluster = ['type', max_col_name]
book_DOC.groupby(type_and_cluster).size()
type max_y_raw
non-fiction 1 7
novel 1 17
stories 0 3
1 22
2 1
dtype: int64
# Chapter-level document table: one row per entry of TFIDF's index, carrying
# the owning book's LIB metadata plus a display label and two summary columns.
chap_DOC = pd.DataFrame(index=TFIDF.index).join(
    LIB[['author', 'title', 'type', 'decade']],
    on='book_id',
)
# label layout: <index level 0>-<author>--<title>-<index level 1>
chap_DOC['label'] = chap_DOC.apply(
    lambda row: f"{row.name[0]}-{row.author}--{row.title}-{row.name[1]}",
    axis=1,
)
# row-wise mean of the TFIDF matrix
chap_DOC['mean_tfidf'] = TFIDF.mean(axis=1)
# total of BOW's `n` column per group of the first two OHCO levels
chap_DOC['n_tokens'] = BOW.groupby(OHCO[:2]).n.sum()
chap_DOC
| author | title | type | decade | label | mean_tfidf | n_tokens | ||
|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||
| 98 | 1 | dickens | a tale of two cities | novel | 1850 | 98-dickens--a tale of two cities-1 | 0.000363 | 1017 |
| 2 | dickens | a tale of two cities | novel | 1850 | 98-dickens--a tale of two cities-2 | 0.000233 | 2044 | |
| 3 | dickens | a tale of two cities | novel | 1850 | 98-dickens--a tale of two cities-3 | 0.000269 | 1638 | |
| 4 | dickens | a tale of two cities | novel | 1850 | 98-dickens--a tale of two cities-4 | 0.000422 | 4439 | |
| 5 | dickens | a tale of two cities | novel | 1850 | 98-dickens--a tale of two cities-5 | 0.000330 | 4218 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 9 | dickens | the poems and verses of charles dickens | stories | 1880 | 35536-dickens--the poems and verses of charles... | 0.000507 | 942 |
| 10 | dickens | the poems and verses of charles dickens | stories | 1880 | 35536-dickens--the poems and verses of charles... | 0.000797 | 356 | |
| 11 | dickens | the poems and verses of charles dickens | stories | 1880 | 35536-dickens--the poems and verses of charles... | 0.000327 | 685 | |
| 12 | dickens | the poems and verses of charles dickens | stories | 1880 | 35536-dickens--the poems and verses of charles... | 0.000338 | 547 | |
| 13 | dickens | the poems and verses of charles dickens | stories | 1880 | 35536-dickens--the poems and verses of charles... | 0.000671 | 439 |
1182 rows × 7 columns
# Run the project PCA helper on TFIDF_sigs with document normalisation on and
# both centering options off; the result is unpacked into three tables.
LOADINGS, DCM, COMPINF = get_pca(
    TFIDF_sigs,
    norm_docs=True,
    center_by_mean=False,
    center_by_variance=False,
)

# DCM rows plotted on columns 0 and 1, coloured by type and sized by |mean_tfidf|
px.scatter(
    DCM,
    x=0,
    y=1,
    color=chap_DOC.type,
    size=np.abs(chap_DOC.mean_tfidf),
    hover_name=chap_DOC.label,
    marginal_x='box',
    marginal_y='box',
    height=1000,
)

# LOADINGS joined with SIGS (inner join on the index), plotted on columns 0 and 1
X = LOADINGS.join(SIGS, how='inner').reset_index()
px.scatter(
    X,
    x=0,
    y=1,
    size=X.n,
    color=X.dfidf,
    hover_name='term_str',
    hover_data=['max_pos'],
    marginal_x='box',
    marginal_y='box',
    height=1000,
    width=1000,
)
COMPINF
| pos | neg | eig_val | exp_var | |
|---|---|---|---|---|
| pc_id | ||||
| 0 | ha maam rejoined inquired aint | public ladies sea gentlemen houses | 0.017467 | 0.156175 |
| 1 | says takes aint em o | child loved brother father sister | 0.016741 | 0.149683 |
| 2 | maam sister ladies brother marriage | ha horses crowd road river | 0.014116 | 0.126209 |
| 3 | ha ladies gentlemen inquired coach | says child father mother loved | 0.013638 | 0.121941 |
| 4 | coach maam ladies shop town | ha eh brother o rejoined | 0.010121 | 0.090495 |
| 5 | maam mother child boys boy | ha gentlemen brother court office | 0.009784 | 0.087477 |
| 6 | brother son maam court father | child ladies love happy girl | 0.008110 | 0.072511 |
| 7 | sister boy aint o boys | ha maam gentlemen ladies says | 0.007541 | 0.067423 |
| 8 | ladies boys father boy son | maam letter office river reference | 0.007492 | 0.066984 |
| 9 | ha sea sister school children | crowd girl gentlemen rejoined inquired | 0.006834 | 0.061103 |
# prince PCA with 6 components, fitted on the full TFIDF matrix.
# Mean/std rescaling is disabled here; the original author notes that this
# was already set and applied to TFIDF upstream (not verifiable from this cell).
pca = PCA(
    n_components=6,
    n_iter=3,
    rescale_with_mean=False,
    rescale_with_std=False,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
).fit(TFIDF)

# project the same matrix with the fitted model
dcm = pca.transform(TFIDF)
dcm
| 0 | 1 | 2 | 3 | 4 | 5 | ||
|---|---|---|---|---|---|---|---|
| book_id | chap_id | ||||||
| 98 | 1 | 0.079602 | -0.047823 | -0.032369 | 0.022100 | -0.028963 | -0.002443 |
| 2 | 0.080926 | -0.050889 | -0.033654 | 0.017720 | -0.035551 | 0.008480 | |
| 3 | 0.077527 | -0.043145 | -0.028152 | 0.018199 | -0.025712 | 0.004466 | |
| 4 | 0.176733 | -0.106757 | -0.057149 | 0.045436 | -0.043284 | 0.005294 | |
| 5 | 0.108728 | -0.070882 | -0.018355 | 0.023081 | -0.020364 | 0.021739 | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 9 | 0.125069 | -0.072843 | -0.055778 | 0.018766 | -0.043503 | 0.008264 |
| 10 | 0.503005 | 0.189780 | -0.045538 | -0.070940 | -0.269298 | 0.248859 | |
| 11 | 0.082916 | -0.048396 | -0.036694 | 0.014311 | -0.034964 | -0.007043 | |
| 12 | 0.079417 | -0.044509 | -0.030858 | 0.015558 | -0.033919 | 0.009589 | |
| 13 | 0.211297 | -0.120470 | -0.095485 | 0.067477 | -0.114600 | 0.044557 |
1182 rows × 6 columns
# rows of dcm on components 0 and 1, coloured by type and sized by token
# count, with box plots on both margins
px.scatter(
    dcm,
    x=0,
    y=1,
    color=chap_DOC.type,
    size=chap_DOC.n_tokens,
    hover_name=chap_DOC.label,
    marginal_x='box',
    marginal_y='box',
    height=1000,
    width=1200,
)
# function to calculate the upper fence / bound in the box plots above for the different PCs
def upper_fence(df, books, pc):
    """Return the upper fence ``Q3 + 1.5 * IQR`` of one column for selected rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the component scores.
    books : list-like
        Row labels passed as the row selector to ``df.loc``.
    pc : hashable
        Label of the column to evaluate.

    Returns
    -------
    float
        Third quartile plus 1.5 times the interquartile range of the
        selected values, using pandas' default quantile interpolation.
    """
    # select once instead of re-slicing the frame for every quantile
    scores = df.loc[books, pc]
    q1 = scores.quantile(0.25)
    q3 = scores.quantile(0.75)
    pc_IQR = q3 - q1
    return 1.5 * pc_IQR + q3
# upper fence for PC 0, computed over every book listed in LIB
dickens_0_upper_fence = upper_fence(dcm, LIB.index.values, 0)
# upper fence for PC 1, computed over every book listed in LIB
dickens_1_upper_fence = upper_fence(dcm, LIB.index.values, 1)
# outliers: chapters whose PC 0 or PC 1 score lies above the corresponding upper fence
outliers = dcm.loc[(dcm[0] > dickens_0_upper_fence) | (dcm[1] > dickens_1_upper_fence)].index.values
# remove outlier chapters from the corpus: drop the paragraph/sentence/token
# levels so each token row is matched on the remaining index levels
small_CORPUS = CORPUS.loc[
    ~CORPUS.index.droplevel(['para_num', 'sent_num', 'token_num']).isin(outliers)
]
# keep only vocabulary terms that still occur once the outlier chapters are gone
small_VOCAB = VOCAB.loc[VOCAB.index.isin(small_CORPUS.term_str)]
# remove proper nouns -- filter small_VOCAB (not VOCAB), otherwise the
# outlier-based filtering on the previous line is silently thrown away
proper_nouns = ['NNP', 'NNPS']
small_VOCAB = small_VOCAB.loc[~small_VOCAB.max_pos.isin(proper_nouns)]
# remove terms containing digits
small_VOCAB = small_VOCAB.loc[~small_VOCAB.index.str.contains('[0-9]', regex = True)]
# fraction of VOCAB terms removed by the three filters above
(VOCAB.shape[0] - small_VOCAB.shape[0]) / VOCAB.shape[0]
0.15425531914893617
# Restrict the reduced corpus to tokens whose term is still in small_VOCAB
# (small_VOCAB has had proper nouns and digit-bearing terms removed).
in_small_vocab = small_CORPUS.term_str.isin(small_VOCAB.index.values)
small_CORPUS = small_CORPUS.loc[in_small_vocab]
# share of CORPUS rows dropped overall (about 11% in the recorded run)
(len(CORPUS) - len(small_CORPUS)) / len(CORPUS)
0.10609553711053038
# bag of words for the reduced corpus, built with the CHAPS grouping
small_BOW = create_bow(small_CORPUS, CHAPS)
# suppress chained assignment warning
pd.options.mode.chained_assignment = None
small_DTCM, small_TFIDF, small_BOW, small_DFIDF, small_VOCAB = get_tfidf(small_BOW, small_VOCAB, tf_method = 'max', idf_method = 'standard')
# document table for the reduced corpus: one row per entry of small_TFIDF's index
small_chap_DOC = pd.DataFrame(index = small_TFIDF.index)
small_chap_DOC = small_chap_DOC.join(LIB[['author', 'title', 'type', 'decade']], on = 'book_id')
small_chap_DOC['label'] = small_chap_DOC.apply(lambda x: "{}-{}-{}".format(x.name[0], x.author, x.name[1]), 1)
# mean TF-IDF must come from the reduced matrix; the full TFIDF still
# contains the removed terms and would give the wrong statistics here
small_chap_DOC['mean_tfidf'] = small_TFIDF.mean(1)
small_chap_DOC['n_tokens'] = small_BOW.groupby(OHCO[:2]).n.sum()
# NOTE(review): this refits the existing `pca` object rather than a new
# estimator; presumably fit() returns that same object, so `pca` itself now
# reflects small_TFIDF -- confirm nothing later expects the earlier fit
small_pca = pca.fit(small_TFIDF)
small_dcm = small_pca.transform(small_TFIDF)
px.scatter(small_dcm, 0, 1,
           color=small_chap_DOC.type,
           size=small_chap_DOC.n_tokens, hover_name=small_chap_DOC.label,
           height=1000, width=1200,
           marginal_x='box', marginal_y='box')
# PCA on TFIDF_sigs
# (the leading "TFIDF_sigs)¶" on this line was the tail of a notebook heading
# fused onto the statement during export; it is not code)
# NOTE(review): this refits the existing `pca` object; presumably fit()
# returns that same object, so `pca_sigs` and `pca` would be one model -- confirm
pca_sigs = pca.fit(TFIDF_sigs)
dcm_sigs = pca_sigs.transform(TFIDF_sigs)
dcm_sigs
| 0 | 1 | 2 | 3 | 4 | 5 | ||
|---|---|---|---|---|---|---|---|
| book_id | chap_id | ||||||
| 98 | 1 | 0.059308 | 0.027044 | -0.004335 | 0.002380 | -0.007692 | -0.010751 |
| 2 | 0.064570 | -0.009571 | -0.008245 | -0.023042 | 0.019243 | 0.009329 | |
| 3 | 0.083818 | 0.014368 | -0.013850 | -0.018862 | -0.002281 | 0.017569 | |
| 4 | 0.144831 | 0.012586 | -0.011949 | -0.017702 | -0.018918 | 0.001366 | |
| 5 | 0.099830 | -0.004260 | -0.004859 | -0.025189 | -0.001643 | 0.008036 | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | 9 | 0.088392 | 0.011102 | -0.007005 | -0.002917 | -0.019529 | -0.001646 |
| 10 | 0.184596 | -0.012318 | 0.001723 | -0.034009 | 0.035516 | -0.121150 | |
| 11 | 0.072715 | 0.016359 | -0.004793 | -0.000527 | -0.023610 | 0.017840 | |
| 12 | 0.072537 | 0.022408 | -0.005806 | 0.002997 | -0.015566 | -0.019207 | |
| 13 | 0.107308 | 0.026861 | -0.012891 | -0.000272 | -0.013886 | -0.025155 |
1182 rows × 6 columns
# rows of dcm_sigs on components 0 and 1, coloured by type and sized by token
# count, with box plots on both margins
px.scatter(
    dcm_sigs,
    x=0,
    y=1,
    color=chap_DOC.type,
    size=chap_DOC.n_tokens,
    hover_name=chap_DOC.label,
    marginal_x='box',
    marginal_y='box',
    height=1000,
    width=1200,
)
# save BOW for topic modeling (written to the current working directory)
BOW.to_csv('dickens_BOW.csv')